From 706b4fc47bbc608932d3b491ae19a3b9cde9497b Mon Sep 17 00:00:00 2001
From: Dimitry Andric
Date: Fri, 17 Jan 2020 20:45:01 +0000
Subject: Vendor import of llvm-project master e26a78e70, the last commit
 before the llvmorg-11-init tag, from which release/10.x was branched.

---
 llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 420 +++++++++++++++++++----
 1 file changed, 346 insertions(+), 74 deletions(-)

(limited to 'llvm/lib/Target/AArch64/AArch64FrameLowering.cpp')

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 68e1e6a30224..ea3e800a1ad2 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -206,6 +206,11 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
   return DefaultSafeSPDisplacement;
 }
 
+TargetStackID::Value
+AArch64FrameLowering::getStackIDForScalableVectors() const {
+  return TargetStackID::SVEVector;
+}
+
 /// Returns the size of the entire SVE stackframe (calleesaves + spills).
 static StackOffset getSVEStackSize(const MachineFunction &MF) {
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -222,7 +227,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
 
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  unsigned NumBytes = AFI->getLocalStackSize();
+  uint64_t NumBytes = AFI->getLocalStackSize();
 
   return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
            getSVEStackSize(MF));
@@ -239,7 +244,7 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
   if (MF.hasEHFunclets())
     return true;
   // Retain behavior of always omitting the FP for leaf functions when possible.
-  if (MFI.hasCalls() && MF.getTarget().Options.DisableFramePointerElim(MF))
+  if (MF.getTarget().Options.DisableFramePointerElim(MF))
     return true;
   if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
       MFI.hasStackMap() || MFI.hasPatchPoint() ||
@@ -424,7 +429,7 @@ bool AArch64FrameLowering::canUseAsPrologue(
 }
 
 static bool windowsRequiresStackProbe(MachineFunction &MF,
                                       uint64_t StackSizeInBytes) {
-                                      unsigned StackSizeInBytes) {
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   if (!Subtarget.isTargetWindows())
     return false;
@@ -441,15 +446,12 @@ static bool windowsRequiresStackProbe(MachineFunction &MF,
 }
 
 bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
-    MachineFunction &MF, unsigned StackBumpBytes) const {
+    MachineFunction &MF, uint64_t StackBumpBytes) const {
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
 
-  if (MF.getFunction().hasOptSize())
-    return false;
-
   if (AFI->getLocalStackSize() == 0)
     return false;
 
@@ -723,7 +725,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
 
 // Fixup callee-save register save/restore instructions to take into account
 // combined SP bump by adding the local stack size to the stack offsets.
 static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
-                                              unsigned LocalStackSize,
+                                              uint64_t LocalStackSize,
                                               bool NeedsWinCFI,
                                               bool *HasWinCFI) {
   if (AArch64InstrInfo::isSEHInstruction(MI))
@@ -834,6 +836,24 @@ static bool isTargetDarwin(const MachineFunction &MF) {
   return MF.getSubtarget().isTargetDarwin();
 }
 
+static bool isTargetWindows(const MachineFunction &MF) {
+  return MF.getSubtarget().isTargetWindows();
+}
+
+// Convenience function to determine whether I is an SVE callee save.
+static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
+  switch (I->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STR_ZXI:
+  case AArch64::STR_PXI:
+  case AArch64::LDR_ZXI:
+  case AArch64::LDR_PXI:
+    return I->getFlag(MachineInstr::FrameSetup) ||
+           I->getFlag(MachineInstr::FrameDestroy);
+  }
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -844,8 +864,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  bool needsFrameMoves = (MMI.hasDebugInfo() || F.needsUnwindTableEntry()) &&
-                         !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+  bool needsFrameMoves =
+      MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
   bool HasFP = hasFP(MF);
   bool NeedsWinCFI = needsWinCFI(MF);
   bool HasWinCFI = false;
@@ -897,8 +917,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // pointer from the funclet. We only save the callee saved registers in the
   // funclet, which are really the callee saved registers of the parent
   // function, including the funclet.
-  int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF)
-                           : (int)MFI.getStackSize();
+  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
+                               : MFI.getStackSize();
   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
     assert(!SVEStackSize &&
@@ -916,15 +936,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                     {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
                     false, NeedsWinCFI, &HasWinCFI);
-    if (!NeedsWinCFI) {
+    if (!NeedsWinCFI && needsFrameMoves) {
       // Label used to tie together the PROLOG_LABEL and the MachineMoves.
       MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
-      // Encode the stack size of the leaf function.
-      unsigned CFIIndex = MF.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex)
-          .setMIFlags(MachineInstr::FrameSetup);
+        // Encode the stack size of the leaf function.
+        unsigned CFIIndex = MF.addFrameInst(
+            MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
+        BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex)
+            .setMIFlags(MachineInstr::FrameSetup);
     }
   }
 
@@ -965,7 +985,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // and pre-inc if we decided to combine the callee-save and local stack
   // pointer bump above.
   MachineBasicBlock::iterator End = MBB.end();
-  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
+  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
+         !IsSVECalleeSave(MBBI)) {
     if (CombineSPBump)
       fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
                                         NeedsWinCFI, &HasWinCFI);
@@ -999,7 +1020,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
 
   if (HasFP) {
     // Only set up FP if we actually need to.
-    int FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
+    int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
 
     if (CombineSPBump)
       FPOffset += AFI->getLocalStackSize();
@@ -1014,7 +1035,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   if (windowsRequiresStackProbe(MF, NumBytes)) {
-    uint32_t NumWords = NumBytes >> 4;
+    uint64_t NumWords = NumBytes >> 4;
     if (NeedsWinCFI) {
       HasWinCFI = true;
       // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
@@ -1107,7 +1128,35 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     NumBytes = 0;
   }
 
-  emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII,
+  StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
+  MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
+
+  // Process the SVE callee-saves to determine what space needs to be
+  // allocated.
+  if (AFI->getSVECalleeSavedStackSize()) {
+    // Find callee save instructions in frame.
+    CalleeSavesBegin = MBBI;
+    assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
+    while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
+      ++MBBI;
+    CalleeSavesEnd = MBBI;
+
+    int64_t OffsetToFirstCalleeSaveFromSP =
+        MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
+    StackOffset OffsetToCalleeSavesFromSP =
+        StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
+    AllocateBefore -= OffsetToCalleeSavesFromSP;
+    AllocateAfter = SVEStackSize - AllocateBefore;
+  }
+
+  // Allocate space for the callee saves (if any).
+  emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
+                  -AllocateBefore, TII,
+                  MachineInstr::FrameSetup);
+
+  // Finally allocate remaining SVE stack space.
+  emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
+                  -AllocateAfter, TII,
                   MachineInstr::FrameSetup);
 
   // Allocate space for the rest of the frame.
@@ -1343,8 +1392,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     IsFunclet = isFuncletReturnInstr(*MBBI);
   }
 
-  int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF)
-                           : MFI.getStackSize();
+  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
+                               : MFI.getStackSize();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
 
   // All calls are tail calls in GHC calling conv, and functions have no
@@ -1444,7 +1493,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   MachineBasicBlock::iterator Begin = MBB.begin();
   while (LastPopI != Begin) {
     --LastPopI;
-    if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
+    if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
+        IsSVECalleeSave(LastPopI)) {
       ++LastPopI;
       break;
     } else if (CombineSPBump)
@@ -1476,11 +1526,53 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   NumBytes -= PrologueSaveSize;
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
+  // Process the SVE callee-saves to determine what space needs to be
+  // deallocated.
+  StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
+  MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
+  if (AFI->getSVECalleeSavedStackSize()) {
+    RestoreBegin = std::prev(RestoreEnd);;
+    while (IsSVECalleeSave(RestoreBegin) &&
+           RestoreBegin != MBB.begin())
+      --RestoreBegin;
+    ++RestoreBegin;
+
+    assert(IsSVECalleeSave(RestoreBegin) &&
+           IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
+
+    int64_t OffsetToFirstCalleeSaveFromSP =
+        MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
+    StackOffset OffsetToCalleeSavesFromSP =
+        StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
+    DeallocateBefore = OffsetToCalleeSavesFromSP;
+    DeallocateAfter = SVEStackSize - DeallocateBefore;
+  }
+
   // Deallocate the SVE area.
-  if (SVEStackSize)
-    if (!AFI->isStackRealigned())
-      emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize,
-                      TII, MachineInstr::FrameDestroy);
+  if (SVEStackSize) {
+    if (AFI->isStackRealigned()) {
+      if (AFI->getSVECalleeSavedStackSize())
+        // Set SP to start of SVE area, from which the callee-save reloads
+        // can be done. The code below will deallocate the stack space
+        // space by moving FP -> SP.
+        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
+                        -SVEStackSize, TII, MachineInstr::FrameDestroy);
+    } else {
+      if (AFI->getSVECalleeSavedStackSize()) {
+        // Deallocate the non-SVE locals first before we can deallocate (and
+        // restore callee saves) from the SVE area.
+        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+                        {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy);
+        NumBytes = 0;
+      }
+
+      emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+                      DeallocateBefore, TII, MachineInstr::FrameDestroy);
+
+      emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+                      DeallocateAfter, TII, MachineInstr::FrameDestroy);
+    }
+  }
 
   if (!hasFP(MF)) {
     bool RedZone = canUseRedZone(MF);
@@ -1490,7 +1582,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       return;
 
     bool NoCalleeSaveRestore = PrologueSaveSize == 0;
-    int StackRestoreBytes = RedZone ? 0 : NumBytes;
+    int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
     if (NoCalleeSaveRestore)
       StackRestoreBytes += AfterCSRPopSize;
 
@@ -1582,19 +1674,20 @@ int AArch64FrameLowering::getNonLocalFrameIndexReference(
   return getSEHFrameIndexOffset(MF, FI);
 }
 
-static StackOffset getFPOffset(const MachineFunction &MF, int ObjectOffset) {
+static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) {
   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   bool IsWin64 =
       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
   unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
-  unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize();
+  unsigned FPAdjust = isTargetDarwin(MF)
+                        ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo());
   return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
 }
 
-static StackOffset getStackOffset(const MachineFunction &MF, int ObjectOffset) {
+static StackOffset getStackOffset(const MachineFunction &MF, int64_t ObjectOffset) {
   const auto &MFI = MF.getFrameInfo();
-  return {ObjectOffset + (int)MFI.getStackSize(), MVT::i8};
+  return {ObjectOffset + (int64_t)MFI.getStackSize(), MVT::i8};
 }
 
 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
@@ -1611,7 +1704,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference(
     const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP,
     bool ForSimm) const {
   const auto &MFI = MF.getFrameInfo();
-  int ObjectOffset = MFI.getObjectOffset(FI);
+  int64_t ObjectOffset = MFI.getObjectOffset(FI);
   bool isFixed = MFI.isFixedObjectIndex(FI);
   bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector;
   return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
@@ -1619,7 +1712,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference(
 }
 
 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
-    const MachineFunction &MF, int ObjectOffset, bool isFixed, bool isSVE,
+    const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
     unsigned &FrameReg, bool PreferFP, bool ForSimm) const {
   const auto &MFI = MF.getFrameInfo();
   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
@@ -1627,10 +1720,10 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
 
-  int FPOffset = getFPOffset(MF, ObjectOffset).getBytes();
-  int Offset = getStackOffset(MF, ObjectOffset).getBytes();
+  int64_t FPOffset = getFPOffset(MF, ObjectOffset).getBytes();
+  int64_t Offset = getStackOffset(MF, ObjectOffset).getBytes();
   bool isCSR =
-      !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize());
+      !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
 
   const StackOffset &SVEStackSize = getSVEStackSize(MF);
 
@@ -1781,6 +1874,8 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
 
   // TODO: LR can be paired with any register. We don't support this yet in
   // the MCLayer. We need to add support for the save_lrpair unwind code.
+  if (Reg2 == AArch64::FP)
+    return true;
   if (!NeedsWinCFI)
     return false;
   if (Reg2 == Reg1 + 1)
@@ -1793,9 +1888,9 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
 /// LR and FP need to be allocated together when the frame needs to save
 /// the frame-record. This means any other register pairing with LR is invalid.
 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
-                                      bool NeedsWinCFI, bool NeedsFrameRecord) {
-  if (NeedsWinCFI)
-    return invalidateWindowsRegisterPairing(Reg1, Reg2, true);
+                                      bool UsesWinAAPCS, bool NeedsWinCFI, bool NeedsFrameRecord) {
+  if (UsesWinAAPCS)
+    return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI);
 
   // If we need to store the frame record, don't pair any register
   // with LR other than FP.
@@ -1812,11 +1907,27 @@ struct RegPairInfo {
   unsigned Reg2 = AArch64::NoRegister;
   int FrameIdx;
   int Offset;
-  enum RegType { GPR, FPR64, FPR128 } Type;
+  enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
 
   RegPairInfo() = default;
 
   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+
+  unsigned getScale() const {
+    switch (Type) {
+    case PPR:
+      return 2;
+    case GPR:
+    case FPR64:
+      return 8;
+    case ZPR:
+    case FPR128:
+      return 16;
+    }
+    llvm_unreachable("Unsupported type");
+  }
+
+  bool isScalable() const { return Type == PPR || Type == ZPR; }
 };
 
 } // end anonymous namespace
@@ -1829,6 +1940,7 @@ static void computeCalleeSaveRegisterPairs(
   if (CSI.empty())
     return;
 
+  bool IsWindows = isTargetWindows(MF);
   bool NeedsWinCFI = needsWinCFI(MF);
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1841,7 +1953,8 @@ static void computeCalleeSaveRegisterPairs(
           CC == CallingConv::PreserveMost ||
           (Count & 1) == 0) &&
          "Odd number of callee-saved regs to spill!");
-  int Offset = AFI->getCalleeSavedStackSize();
+  int ByteOffset = AFI->getCalleeSavedStackSize();
+  int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
   // On Linux, we will have either one or zero non-paired register. On Windows
   // with CFI, we can have multiple unpaired registers in order to utilize the
   // available unwind codes. This flag assures that the alignment fixup is done
@@ -1857,6 +1970,10 @@ static void computeCalleeSaveRegisterPairs(
       RPI.Type = RegPairInfo::FPR64;
     else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
       RPI.Type = RegPairInfo::FPR128;
+    else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
+      RPI.Type = RegPairInfo::ZPR;
+    else if (AArch64::PPRRegClass.contains(RPI.Reg1))
+      RPI.Type = RegPairInfo::PPR;
     else
       llvm_unreachable("Unsupported register class.");
 
@@ -1866,7 +1983,7 @@ static void computeCalleeSaveRegisterPairs(
       switch (RPI.Type) {
       case RegPairInfo::GPR:
         if (AArch64::GPR64RegClass.contains(NextReg) &&
-            !invalidateRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
+            !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, NeedsWinCFI,
                                        NeedsFrameRecord))
           RPI.Reg2 = NextReg;
         break;
@@ -1879,6 +1996,9 @@ static void computeCalleeSaveRegisterPairs(
         if (AArch64::FPR128RegClass.contains(NextReg))
           RPI.Reg2 = NextReg;
         break;
+      case RegPairInfo::PPR:
+      case RegPairInfo::ZPR:
+        break;
       }
     }
 
@@ -1905,6 +2025,11 @@ static void computeCalleeSaveRegisterPairs(
             RPI.Reg1 == AArch64::LR) &&
            "FrameRecord must be allocated together with LR");
 
+    // Windows AAPCS has FP and LR reversed.
+    assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
+            RPI.Reg2 == AArch64::LR) &&
+           "FrameRecord must be allocated together with LR");
+
     // MachO's compact unwind format relies on all registers being stored in
     // adjacent register pairs.
     assert((!produceCompactUnwindFrame(MF) ||
@@ -1916,23 +2041,33 @@ static void computeCalleeSaveRegisterPairs(
 
     RPI.FrameIdx = CSI[i].getFrameIdx();
 
-    int Scale = RPI.Type == RegPairInfo::FPR128 ? 16 : 8;
-    Offset -= RPI.isPaired() ? 2 * Scale : Scale;
+    int Scale = RPI.getScale();
+    if (RPI.isScalable())
+      ScalableByteOffset -= Scale;
+    else
+      ByteOffset -= RPI.isPaired() ? 2 * Scale : Scale;
+
+    assert(!(RPI.isScalable() && RPI.isPaired()) &&
+           "Paired spill/fill instructions don't exist for SVE vectors");
 
     // Round up size of non-pair to pair size if we need to pad the
     // callee-save area to ensure 16-byte alignment.
     if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone &&
-        RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) {
+        !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
+        !RPI.isPaired()) {
       FixupDone = true;
-      Offset -= 8;
-      assert(Offset % 16 == 0);
+      ByteOffset -= 8;
+      assert(ByteOffset % 16 == 0);
       assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
       MFI.setObjectAlignment(RPI.FrameIdx, 16);
     }
 
+    int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
     assert(Offset % Scale == 0);
     RPI.Offset = Offset / Scale;
-    assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
+
+    assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
+            (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
            "Offset out of bounds for LDP/STP immediate");
 
     RegPairs.push_back(RPI);
@@ -2024,6 +2159,16 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
        Size = 16;
        Align = 16;
        break;
+    case RegPairInfo::ZPR:
+       StrOpc = AArch64::STR_ZXI;
+       Size = 16;
+       Align = 16;
+       break;
+    case RegPairInfo::PPR:
+       StrOpc = AArch64::STR_PXI;
+       Size = 2;
+       Align = 2;
+       break;
     }
     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -2064,6 +2209,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameSetup);
 
+    // Update the StackIDs of the SVE stack slots.
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
+      MFI.setStackID(RPI.FrameIdx, TargetStackID::SVEVector);
+
   }
   return true;
 }
@@ -2115,6 +2265,16 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
        Size = 16;
        Align = 16;
       break;
+    case RegPairInfo::ZPR:
+       LdrOpc = AArch64::LDR_ZXI;
+       Size = 16;
+       Align = 16;
+       break;
+    case RegPairInfo::PPR:
+       LdrOpc = AArch64::LDR_PXI;
+       Size = 2;
+       Align = 2;
+       break;
     }
     LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -2149,12 +2309,20 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
   };
-  if (ReverseCSRRestoreSeq)
-    for (const RegPairInfo &RPI : reverse(RegPairs))
+
+  // SVE objects are always restored in reverse order.
+  for (const RegPairInfo &RPI : reverse(RegPairs))
+    if (RPI.isScalable())
       EmitMI(RPI);
-  else
+
+  if (ReverseCSRRestoreSeq) {
+    for (const RegPairInfo &RPI : reverse(RegPairs))
+      if (!RPI.isScalable())
+        EmitMI(RPI);
+  } else
     for (const RegPairInfo &RPI : RegPairs)
-      EmitMI(RPI);
+      if (!RPI.isScalable())
+        EmitMI(RPI);
 
   if (NeedShadowCallStackProlog) {
     // Shadow call stack epilog: ldr x30, [x18, #-8]!
@@ -2201,7 +2369,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
       SavedRegs.set(Reg);
 
     bool RegUsed = SavedRegs.test(Reg);
-    unsigned PairedReg = CSRegs[i ^ 1];
+    unsigned PairedReg = AArch64::NoRegister;
+    if (AArch64::GPR64RegClass.contains(Reg) ||
+        AArch64::FPR64RegClass.contains(Reg) ||
+        AArch64::FPR128RegClass.contains(Reg))
+      PairedReg = CSRegs[i ^ 1];
+
     if (!RegUsed) {
       if (AArch64::GPR64RegClass.contains(Reg) &&
           !RegInfo->isReservedReg(MF, Reg)) {
@@ -2225,16 +2398,23 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
 
   // Calculates the callee saved stack size.
   unsigned CSStackSize = 0;
+  unsigned SVECSStackSize = 0;
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  for (unsigned Reg : SavedRegs.set_bits())
-    CSStackSize += TRI->getRegSizeInBits(Reg, MRI) / 8;
+  for (unsigned Reg : SavedRegs.set_bits()) {
+    auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
+    if (AArch64::PPRRegClass.contains(Reg) ||
+        AArch64::ZPRRegClass.contains(Reg))
+      SVECSStackSize += RegSize;
+    else
+      CSStackSize += RegSize;
+  }
 
   // Save number of saved regs, so we can easily update CSStackSize later.
   unsigned NumSavedRegs = SavedRegs.count();
 
   // The frame record needs to be created by saving the appropriate registers
-  unsigned EstimatedStackSize = MFI.estimateStackSize(MF);
+  uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
   if (hasFP(MF) ||
       windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
     SavedRegs.set(AArch64::FP);
@@ -2248,10 +2428,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
              dbgs() << "\n";);
 
   // If any callee-saved registers are used, the frame cannot be eliminated.
-  unsigned MaxAlign = getStackAlignment();
   int64_t SVEStackSize =
-      alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign);
-  assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+      alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
   bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
 
   // The CSR spill slots have not been allocated yet, so estimateStackSize
@@ -2299,15 +2477,20 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
 
   // Adding the size of additional 64bit GPR saves.
   CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
-  unsigned AlignedCSStackSize = alignTo(CSStackSize, 16);
+  uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
   LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
                << EstimatedStackSize + AlignedCSStackSize
                << " bytes.\n");
 
+  assert((!MFI.isCalleeSavedInfoValid() ||
+          AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
+         "Should not invalidate callee saved info");
+
   // Round up to register pair alignment to avoid additional SP adjustment
   // instructions.
   AFI->setCalleeSavedStackSize(AlignedCSStackSize);
   AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
+  AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
 }
 
 bool AArch64FrameLowering::enableStackSlotScavenging(
@@ -2316,9 +2499,40 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
   return AFI->hasCalleeSaveStackFreeSpace();
 }
 
-int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI,
-                                                    unsigned &MaxAlign) const {
-  // Process all fixed stack objects.
+/// returns true if there are any SVE callee saves.
+static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
+                                      int &Min, int &Max) {
+  Min = std::numeric_limits<int>::max();
+  Max = std::numeric_limits<int>::min();
+
+  if (!MFI.isCalleeSavedInfoValid())
+    return false;
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  for (auto &CS : CSI) {
+    if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
+        AArch64::PPRRegClass.contains(CS.getReg())) {
+      assert((Max == std::numeric_limits<int>::min() ||
+              Max + 1 == CS.getFrameIdx()) &&
+             "SVE CalleeSaves are not consecutive");
+
+      Min = std::min(Min, CS.getFrameIdx());
+      Max = std::max(Max, CS.getFrameIdx());
+    }
+  }
+  return Min != std::numeric_limits<int>::max();
+}
+
+// Process all the SVE stack objects and determine offsets for each
+// object. If AssignOffsets is true, the offsets get assigned.
+// Fills in the first and last callee-saved frame indices into
+// Min/MaxCSFrameIndex, respectively.
+// Returns the size of the stack.
+static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
+                                              int &MinCSFrameIndex,
+                                              int &MaxCSFrameIndex,
+                                              bool AssignOffsets) {
+  // First process all fixed stack objects.
   int64_t Offset = 0;
   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
     if (MFI.getStackID(I) == TargetStackID::SVEVector) {
@@ -2327,12 +2541,69 @@ int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI,
       Offset = FixedOffset;
     }
 
-  // Note: We don't take allocatable stack objects into
-  // account yet, because allocation for those is not yet
-  // implemented.
+  auto Assign = [&MFI](int FI, int64_t Offset) {
+    LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
+    MFI.setObjectOffset(FI, Offset);
+  };
+
+  // Then process all callee saved slots.
+  if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
+    // Make sure to align the last callee save slot.
+    MFI.setObjectAlignment(MaxCSFrameIndex, 16U);
+
+    // Assign offsets to the callee save slots.
+    for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
+      Offset += MFI.getObjectSize(I);
+      Offset = alignTo(Offset, MFI.getObjectAlignment(I));
+      if (AssignOffsets)
+        Assign(I, -Offset);
+    }
+  }
+
+  // Create a buffer of SVE objects to allocate and sort it.
+  SmallVector<int, 8> ObjectsToAllocate;
+  for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
+    unsigned StackID = MFI.getStackID(I);
+    if (StackID != TargetStackID::SVEVector)
+      continue;
+    if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
+      continue;
+    if (MFI.isDeadObjectIndex(I))
+      continue;
+
+    ObjectsToAllocate.push_back(I);
+  }
+
+  // Allocate all SVE locals and spills
+  for (unsigned FI : ObjectsToAllocate) {
+    unsigned Align = MFI.getObjectAlignment(FI);
+    // FIXME: Given that the length of SVE vectors is not necessarily a power of
+    // two, we'd need to align every object dynamically at runtime if the
+    // alignment is larger than 16. This is not yet supported.
+    if (Align > 16)
+      report_fatal_error(
+          "Alignment of scalable vectors > 16 bytes is not yet supported");
+
+    Offset = alignTo(Offset + MFI.getObjectSize(FI), Align);
+    if (AssignOffsets)
+      Assign(FI, -Offset);
+  }
+
   return Offset;
 }
 
+int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
+    MachineFrameInfo &MFI) const {
+  int MinCSFrameIndex, MaxCSFrameIndex;
+  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
+}
+
+int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
+    MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
+  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
+                                        true);
+}
+
 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
   MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -2340,12 +2611,13 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
          "Upwards growing stack unsupported");
 
-  unsigned MaxAlign = getStackAlignment();
-  int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign);
+  int MinCSFrameIndex, MaxCSFrameIndex;
+  int64_t SVEStackSize =
+      assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
 
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign));
-  assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+  AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
+  AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
 
   // If this function isn't doing Win64-style C++ EH, we don't need to do
   // anything.
-- 
cgit v1.2.3
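
Supplementary sketch 1. The two-counter layout that the patch introduces in computeCalleeSaveRegisterPairs() (ByteOffset for GPR/FPR saves, ScalableByteOffset for Z/P saves, store immediate = offset / scale) can be illustrated with a small standalone C++ program. This is a minimal sketch under an assumed save list; the names (Slot, scaleOf, isScalable) and the frame contents are invented for illustration and nothing below is LLVM API.

#include <cassert>
#include <cstdio>
#include <vector>

enum class Slot { GPRPair, FPR64, ZPR, PPR };

static bool isScalable(Slot s) { return s == Slot::ZPR || s == Slot::PPR; }

static int scaleOf(Slot s) {
  switch (s) {
  case Slot::PPR:     return 2;  // predicate register: 2 scalable bytes
  case Slot::GPRPair:            // 8 bytes per register, 16 for the pair
  case Slot::FPR64:   return 8;
  case Slot::ZPR:     return 16; // vector register: 16 scalable bytes
  }
  return 0;
}

int main() {
  // Assumed save list: the x29/x30 frame record, d8, then z8, z9 and p4.
  std::vector<Slot> saves = {Slot::GPRPair, Slot::FPR64, Slot::ZPR, Slot::ZPR,
                             Slot::PPR};
  // Each counter starts at its region's total size and walks downward, like
  // ByteOffset / ScalableByteOffset in the patch. 24 = 16 (pair) + 8 (d8);
  // 48 = alignTo(16 + 16 + 2, 16), matching setSVECalleeSavedStackSize().
  int byteOffset = 24;
  int scalableByteOffset = 48;

  for (Slot s : saves) {
    int scale = scaleOf(s);
    if (isScalable(s))
      scalableByteOffset -= scale;  // SVE saves are never paired
    else
      byteOffset -= (s == Slot::GPRPair) ? 2 * scale : scale;
    int offset = isScalable(s) ? scalableByteOffset : byteOffset;
    assert(offset % scale == 0 && "store immediate must be scale-aligned");
    printf("slot %d: imm #%d, scale %d%s\n", (int)s, offset / scale, scale,
           isScalable(s) ? " x vscale" : "");
  }
  // The fixed region is fully used; the scalable region keeps 14 bytes of
  // 16-byte alignment padding below p4's slot (48 - 16 - 16 - 2 = 14).
  assert(byteOffset == 0 && scalableByteOffset == 14);
  return 0;
}

Keeping the scalable bytes in a separate counter is what lets the layout stay correct for any runtime vector length: the fixed-size and scalable regions never need to be added together until the hardware's vscale is known.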
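Supplementary sketch 2. Similarly, the offset scheme in determineSVEStackObjectOffsets() (walk SVE objects, align each, assign a negative offset measured downward from the top of the SVE area) can be sketched as below. Again an illustrative standalone program under assumed frame contents, not LLVM API; SVEObject and assignOffsets are invented names.

#include <cstdint>
#include <cstdio>
#include <vector>

struct SVEObject {
  int64_t size;   // in scalable bytes
  int64_t align;  // must be <= 16; anything larger would need runtime work
  int64_t offset; // filled in: negative, relative to the top of the SVE area
};

static int64_t alignTo(int64_t value, int64_t align) {
  return (value + align - 1) / align * align;
}

// Mirrors the AssignOffsets=true path: returns the total (unrounded) size.
static int64_t assignOffsets(std::vector<SVEObject> &objects) {
  int64_t offset = 0;
  for (SVEObject &obj : objects) {
    offset = alignTo(offset + obj.size, obj.align);
    obj.offset = -offset; // objects grow downward from the area's top
  }
  return offset;
}

int main() {
  // Assumed frame: two z-register spill slots and one 2-byte predicate slot.
  std::vector<SVEObject> objects = {{16, 16, 0}, {16, 16, 0}, {2, 2, 0}};
  int64_t size = assignOffsets(objects);
  for (const SVEObject &obj : objects)
    printf("size %2lld -> offset %lld\n", (long long)obj.size,
           (long long)obj.offset);
  // The patch rounds the final figure up to 16, as in setStackSizeSVE().
  printf("SVE area: %lld -> %lld scalable bytes\n", (long long)size,
         (long long)alignTo(size, 16));
  return 0;
}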