diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 30df97543bec50..d407407bba814a 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -600,8 +600,9 @@ class CodeGen final : public CodeGenInterface #ifdef TARGET_AMD64 void genPushCalleeSavedRegistersFromMaskAPX(regMaskTP rsPushRegs); unsigned genPopCalleeSavedRegistersFromMaskAPX(regMaskTP rsPopRegs); + bool genSecondFramePtrIsProfitable(); #endif // TARGET_AMD64 -#endif // !defined(TARGET_XARCH) +#endif // defined(TARGET_XARCH) #endif // !defined(TARGET_ARM64) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 6cc7cb9874d6c8..2c863b0285cd76 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -4791,6 +4791,12 @@ void CodeGen::genFinalizeFrame() } #endif // TARGET_ARM +#if defined(TARGET_AMD64) + // The secondary frame-pointer register is reserved as a candidate during LSRA (it is in + // regSet.rsMaskResvd). Unlike ARM, do NOT mark it modified here: whether it is actually pushed is + // decided below by genSecondFramePtrIsProfitable, once FINAL offsets are known. +#endif // TARGET_AMD64 + #ifdef TARGET_ARM64 if (m_compiler->IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform) { @@ -4993,6 +4999,37 @@ void CodeGen::genFinalizeFrame() m_compiler->lvaAssignFrameOffsets(Compiler::FINAL_FRAME_LAYOUT); +#if defined(TARGET_AMD64) + // A secondary frame pointer was reserved as a candidate during LSRA. Now that FINAL offsets are + // known, decide whether establishing it pays off. These offsets assume the register is NOT pushed; + // pushing it only shifts RBP-relative locals deeper and leaves RSP-relative locals unchanged, so + // scanning them is a sound, conservative test. If profitable, mark it modified (so prolog/epilog + // save and restore it) and redo the layout to account for the push; otherwise cancel the + // reservation so no push/lea or unwind data is emitted. 
+ if (genSecondFramePtrReg != REG_NA) + { + if (genSecondFramePtrIsProfitable()) + { + // Reset the layout state first: rsSetRegsModified forbids adding a callee-saved register + // once FINAL layout is complete, and we are about to redo the layout for the push anyway. + m_compiler->lvaDoneFrameLayout = Compiler::REGALLOC_FRAME_LAYOUT; + + regSet.rsSetRegsModified(genRegMask(genSecondFramePtrReg)); + + regMaskTP maskCalleeRegsPushed = regSet.rsGetModifiedCalleeSavedRegsMask(); + maskCalleeRegsPushed &= ~RBM_FLT_CALLEE_SAVED; + m_compiler->compCalleeRegsPushed = genCountBits(maskCalleeRegsPushed); + + m_compiler->lvaAssignFrameOffsets(Compiler::FINAL_FRAME_LAYOUT); + } + else + { + JITDUMP("Cancelling secondary frame pointer: no local lands in the secondary disp8 band\n"); + genSecondFramePtrReg = REG_NA; + } + } +#endif // TARGET_AMD64 + #ifdef DEBUG if (m_compiler->opts.dspCode || m_compiler->opts.disAsm || m_compiler->opts.disAsm2 || verbose) { @@ -5001,6 +5038,72 @@ void CodeGen::genFinalizeFrame() #endif } +#if defined(TARGET_AMD64) +//------------------------------------------------------------------------ +// genSecondFramePtrIsProfitable: decide whether the reserved secondary frame-pointer register is +// worth establishing, given FINAL frame offsets. +// +// Return Value: +// true if some on-frame local has an access that needs a disp32 off the primary base but fits a +// disp8 off the secondary base; false otherwise. +// +// Notes: +// Mirrors emitter::emitIsSecondFramePtrCandidate's band test, so must run after FINAL offsets are +// assigned. This is necessary but not sufficient: it finds at least one redirectable access but does +// not count them, so a method with only one or two redirects can still regress a few bytes (each +// redirect saves 3 bytes, while setup costs a push + lea, a pop per epilog, and an unwind code). +// Counting sites would need an IR walk (MinOpts has no precise ref counts), not worth the Tier0 cost. 
+// +bool CodeGen::genSecondFramePtrIsProfitable() +{ + assert(genSecondFramePtrReg != REG_NA); + assert(m_compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT); + + const bool wantFPbased = genSecondFramePtrFPbased; + const int offset = genSecondFramePtrOffset; + + // A redirect applies when the raw displacement does NOT fit a disp8 but the adjusted displacement + // (dsp + offset when FP-based, dsp - offset when SP-based) does. Precompute the range of raw displacements that fit a disp8 once adjusted. + const int adjFitLo = wantFPbased ? (-128 - offset) : (-128 + offset); + const int adjFitHi = wantFPbased ? (127 - offset) : (127 + offset); + + for (unsigned varNum = 0; varNum < m_compiler->lvaCount; varNum++) + { + const LclVarDsc* const varDsc = m_compiler->lvaGetDesc(varNum); + if (!varDsc->lvOnFrame || m_compiler->lvaIsUnknownSizeLocal(varNum)) + { + continue; + } + + bool fpBased; + const int loDsp = m_compiler->lvaFrameAddress((int)varNum, &fpBased); + if (fpBased != wantFPbased) + { + continue; + } + + // Accesses to this local span [loDsp, hiDsp] (base slot plus any field/element offset). + const int hiDsp = loDsp + (int)m_compiler->lvaLclStackHomeSize(varNum) - 1; + + // Intersect that range with the adjusted-fits-disp8 range. + const int interLo = (loDsp > adjFitLo) ? loDsp : adjFitLo; + const int interHi = (hiDsp < adjFitHi) ? hiDsp : adjFitHi; + if (interLo > interHi) + { + continue; + } + + // The redirect only helps where the raw displacement itself needs a disp32 (dsp < -128 or dsp > 127). + if ((interLo < -128) || (interHi > 127)) + { + return true; + } + } + + return false; +} +#endif // TARGET_AMD64 + /***************************************************************************** * * Generates code for a function prolog. @@ -5554,6 +5657,21 @@ void CodeGen::genFnProlog() // //------------------------------------------------------------------------- +#if defined(TARGET_AMD64) + // Establish the secondary frame-pointer register.
This runs after the frame pointer (if any) and + // after SP is final, so both candidate bases are live. It sits after the OS-reported prolog: the + // register was already saved (with its own unwind code) by genPushCalleeSavedRegisters, so this lea + // just loads a derived address and needs no unwind data. The register is out of allocation, so it + // stays live for the method body. + if (genSecondFramePtrReg != REG_NA) + { + const regNumber base = genSecondFramePtrFPbased ? REG_FPBASE : REG_SPBASE; + const int disp = genSecondFramePtrFPbased ? -genSecondFramePtrOffset : genSecondFramePtrOffset; + GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, genSecondFramePtrReg, base, disp); + regSet.verifyRegUsed(genSecondFramePtrReg); + } +#endif // TARGET_AMD64 + #ifdef TARGET_ARM64 if (m_compiler->compUsesUnknownSizeFrame) { diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index e696c886cd9e28..fc02b9a1d2bcee 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -282,6 +282,16 @@ class CodeGenInterface m_cgFrameRequired = value; } +#ifdef TARGET_AMD64 + // Secondary stack base pointer (see JitSecondFramePtr). When set, this callee-saved register holds + // (primaryBase +/- genSecondFramePtrOffset) and addresses far locals with a disp8 displacement; + // REG_NA means off. genSecondFramePtrFPbased tells whether it shadows RBP (locals at negative + // offsets) or RSP (positive); only accesses on that base are redirected. 
+ regNumber genSecondFramePtrReg = REG_NA; + int genSecondFramePtrOffset = 0; + bool genSecondFramePtrFPbased = false; +#endif // TARGET_AMD64 + #if !HAS_FIXED_REGISTER_SET void SetStackPointerReg(unsigned funcletIndex, regNumber reg); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 4cc36d3133fde0..877b4a16faf689 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -10948,6 +10948,18 @@ void CodeGen::genFuncletProlog(BasicBlock* block) // This is the end of the OS-reported prolog for purposes of unwinding m_compiler->unwindEndProlog(); + // Re-establish the secondary frame-pointer register, but only in FILTER funclets. Catch/finally/ + // fault funclets are entered via CallEHFunclet, which restores all nonvolatiles (including RBX) from + // the establisher CONTEXT, so RBX already holds RBP - offset. Filter funclets use CallEHFilterFunclet, + // which restores only RBP, so RBX must be recomputed. EH methods always use an RBP frame and the + // funclet shares the parent frame via RBP, so the base is always RBP. 
+ if ((genSecondFramePtrReg != REG_NA) && (m_compiler->funCurrentFunc()->funKind == FUNC_FILTER)) + { + assert(genSecondFramePtrFPbased); + GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, genSecondFramePtrReg, REG_FPBASE, -genSecondFramePtrOffset); + regSet.verifyRegUsed(genSecondFramePtrReg); + } + genClearAvxStateInProlog(); } diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 7744c36d4b9e13..7822c699c59189 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -1341,6 +1341,9 @@ void emitter::emitBegFN(bool hasFramePtr emitHasFramePtr = hasFramePtr; +#if defined(TARGET_AMD64) + emitSecondFramePtrActive = (codeGen->genSecondFramePtrReg != REG_NA); +#endif #ifdef DEBUG emitChkAlign = chkAlign; #endif diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 014d5f962e3013..acbcb73aa1d724 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -2628,7 +2628,7 @@ class emitter void emitDispGCinfo(); void emitDispJumpList(); void emitDispClsVar(CORINFO_FIELD_HANDLE fldHnd, ssize_t offs, bool reloc = false); - void emitDispFrameRef(int varx, int disp, int offs, bool asmfm); + void emitDispFrameRef(int varx, int disp, int offs, bool asmfm, instruction ins = INS_none); void emitDispInsAddr(const BYTE* code); void emitDispInsOffs(unsigned offs, bool doffs); void emitDispInsHex(instrDesc* id, BYTE* code, size_t sz); diff --git a/src/coreclr/jit/emitarm.cpp b/src/coreclr/jit/emitarm.cpp index f0696f010d3331..72320b3f069c3e 100644 --- a/src/coreclr/jit/emitarm.cpp +++ b/src/coreclr/jit/emitarm.cpp @@ -7810,7 +7810,7 @@ void emitter::emitDispIns( * Display a stack frame reference. 
*/ -void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm) +void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm, instruction ins) { #ifdef DEBUG printf("["); diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index ec30c012bd5ee5..1929e78dd27137 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -14908,7 +14908,7 @@ void emitter::emitDispInsHelp( * Display a stack frame reference. */ -void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm) +void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm, instruction ins) { #ifdef DEBUG printf("["); diff --git a/src/coreclr/jit/emitloongarch64.cpp b/src/coreclr/jit/emitloongarch64.cpp index 716e15f392627f..a48d94d004520b 100644 --- a/src/coreclr/jit/emitloongarch64.cpp +++ b/src/coreclr/jit/emitloongarch64.cpp @@ -4627,7 +4627,7 @@ void emitter::emitDispIns( * * Display a stack frame reference. */ -void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm) +void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm, instruction ins) { NYI_LOONGARCH64("emitDispFrameRef-----unused on LoongArch64."); } diff --git a/src/coreclr/jit/emitriscv64.cpp b/src/coreclr/jit/emitriscv64.cpp index f1de046359846e..7222d12e64882c 100644 --- a/src/coreclr/jit/emitriscv64.cpp +++ b/src/coreclr/jit/emitriscv64.cpp @@ -4917,7 +4917,7 @@ void emitter::emitDispIns( * Display a stack frame reference. 
*/ -void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm) +void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm, instruction ins) { NYI_RISCV64("emitDispFrameRef-----unimplemented/unused on RISCV64 yet----"); } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index eadf46bb988a87..81e276f14ca1de 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -5251,6 +5251,64 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) return sz; } +#if defined(TARGET_AMD64) +//------------------------------------------------------------------------ +// emitIsSecondFramePtrCandidate: should this stack access be redirected through the secondary +// frame-pointer register (see JitConfig.JitSecondFramePtr)? +// +// Arguments: +// ins -- the instruction +// EBPbased -- whether the canonical access is frame-pointer (RBP) based +// dsp -- the canonical (full effective) displacement off the base +// pAdjustedDsp -- [out] the displacement off the secondary register; written only when the result is true +// +// Return Value: +// true if the access can use [REG_OPT_RSVD2 + disp8] instead of a disp32 form off the canonical +// base; false to keep the canonical base/displacement. +// +// Notes: +// Must be deterministic between size estimation and output. Redirects an access whose canonical +// displacement needs a disp32 but fits a disp8 once shifted by the secondary offset, and only when +// its base (RBP vs RSP) matches the base the secondary register shadows. Instructions with special +// displacement/encoding handling (EVEX, APX extended EVEX, SSE 0F38/0F3A, crc32) are excluded so the +// plain [reg+disp8] form always applies.
+// +bool emitter::emitIsSecondFramePtrCandidate(instruction ins, bool EBPbased, int dsp, int* pAdjustedDsp) +{ + if (codeGen->genSecondFramePtrReg == REG_NA) + { + return false; + } + + // Only redirect accesses whose canonical base matches the base the secondary register shadows + // (RBP for FP-based frames, RSP otherwise). Mixing bases would compute a wrong displacement. + if (EBPbased != codeGen->genSecondFramePtrFPbased) + { + return false; + } + + if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins) || EncodedBySSE38orSSE3A(ins) || + (ins == INS_crc32)) + { + return false; + } + + // The secondary register holds (base - offset) for RBP frames (locals at negative offsets) and + // (base + offset) for RSP frames (locals at positive offsets); invert accordingly. + const int adjusted = EBPbased ? (dsp + codeGen->genSecondFramePtrOffset) : (dsp - codeGen->genSecondFramePtrOffset); + const bool rawFits = ((signed char)dsp == (ssize_t)dsp); + const bool adjFits = ((signed char)adjusted == (ssize_t)adjusted); + + if (rawFits || !adjFits) + { + return false; + } + + *pAdjustedDsp = adjusted; + return true; +} +#endif // TARGET_AMD64 + //------------------------------------------------------------------------ // emitInsSizeSVCalcDisp: Calculate instruction size. // @@ -5276,6 +5334,16 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, adr = m_compiler->lvaFrameAddress(var, &EBPbased); dsp = adr + id->idAddr()->iiaLclVar.lvaOffset(); +#if defined(TARGET_AMD64) + // A redirected access uses a [reg+disp8] form with no SIB byte, regardless of the EVEX/zero-disp + // handling below. 
+ int secondDsp; + if (emitSecondFramePtrActive && emitIsSecondFramePtrCandidate(ins, EBPbased, dsp, &secondDsp)) + { + return size + sizeof(char); + } +#endif // TARGET_AMD64 + dspIsZero = (dsp == 0); bool tryCompress = true; @@ -12225,10 +12293,10 @@ void emitter::emitDispClsVar(CORINFO_FIELD_HANDLE fldHnd, ssize_t offs, bool rel * Display a stack frame reference. */ -void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm) +void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm, instruction ins) { - int addr; - bool bEBP; + int addr = 0; + bool bEBP = false; printf("["); @@ -12253,6 +12321,20 @@ void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm) } } +#if defined(TARGET_AMD64) + // A redirected access is emitted as [REG_OPT_RSVD2 + disp8] rather than the canonical + // [rbp/rsp + disp]. Print what is actually encoded so the listing matches the bytes, then append + // the logical reference as a trailing comment. + bool secondRedirected = false; + int secondDsp = 0; + if (emitSecondFramePtrActive && (m_compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT) && asmfm) + { + bool bEBPcand = false; + int rawDsp = m_compiler->lvaFrameAddress(varx, &bEBPcand) + disp; + secondRedirected = emitIsSecondFramePtrCandidate(ins, bEBPcand, rawDsp, &secondDsp); + } +#endif // TARGET_AMD64 + if (m_compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT) { if (!asmfm) @@ -12262,44 +12344,72 @@ void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm) addr = m_compiler->lvaFrameAddress(varx, &bEBP) + disp; - if (bEBP) +#if defined(TARGET_AMD64) + if (secondRedirected) { - printf(STR_FPBASE); + printf("%s", emitRegName(REG_OPT_RSVD2, EA_PTRSIZE)); - if (addr < 0) + if (secondDsp < 0) { - printf("-0x%02X", -addr); + printf("-0x%02X", -secondDsp); } - else if (addr > 0) + else if (secondDsp > 0) { - printf("+0x%02X", addr); + printf("+0x%02X", secondDsp); } } else - { - /* Adjust the offset by amount 
currently pushed on the stack */ - - printf(STR_SPBASE); - - if (addr < 0) +#endif // TARGET_AMD64 + if (bEBP) { - printf("-0x%02X", -addr); + printf(STR_FPBASE); + + if (addr < 0) + { + printf("-0x%02X", -addr); + } + else if (addr > 0) + { + printf("+0x%02X", addr); + } } - else if (addr > 0) + else { - printf("+0x%02X", addr); - } + /* Adjust the offset by amount currently pushed on the stack */ + + printf(STR_SPBASE); + + if (addr < 0) + { + printf("-0x%02X", -addr); + } + else if (addr > 0) + { + printf("+0x%02X", addr); + } #if !FEATURE_FIXED_OUT_ARGS - if (emitCurStackLvl) - printf("+0x%02X", emitCurStackLvl); + if (emitCurStackLvl) + printf("+0x%02X", emitCurStackLvl); #endif // !FEATURE_FIXED_OUT_ARGS - } + } } printf("]"); + +#if defined(TARGET_AMD64) + if (secondRedirected) + { + // Defer the canonical frame reference to a trailing comment: operands may still follow the + // memory operand on the same line. + emitDispSecondFramePtrPending = true; + emitDispSecondFramePtrFPbased = bEBP; + emitDispSecondFramePtrAddr = addr; + } +#endif // TARGET_AMD64 + #ifdef DEBUG if ((varx >= 0) && m_compiler->opts.varNames && (((IL_OFFSET)offs) != BAD_IL_OFFSET)) { @@ -12842,6 +12952,10 @@ void emitter::emitDispIns( instruction ins = id->idIns(); +#if defined(TARGET_AMD64) + emitDispSecondFramePtrPending = false; +#endif // TARGET_AMD64 + #ifdef DEBUG if (m_compiler->verbose) { @@ -13235,7 +13349,7 @@ void emitter::emitDispIns( #endif emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); #if !FEATURE_FIXED_OUT_ARGS if (ins == INS_pop) @@ -13254,7 +13368,7 @@ void emitter::emitDispIns( printf("%s", sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); emitDispEmbMasking(id); printf(", %s", 
emitRegName(id->idReg1(), attr)); @@ -13269,7 +13383,7 @@ void emitter::emitDispIns( printf("%s", sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); emitDispEmbMasking(id); emitDispConstant(id); break; @@ -13283,7 +13397,7 @@ void emitter::emitDispIns( printf("%s", sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); emitDispEmbMasking(id); printf(", %s", emitRegName(id->idReg1(), attr)); @@ -13317,7 +13431,7 @@ void emitter::emitDispIns( emitDispEmbMasking(id); printf(", %s", sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); emitDispEmbBroadcastCount(id); break; @@ -13331,7 +13445,7 @@ void emitter::emitDispIns( emitDispEmbMasking(id); printf(", %s", sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); emitDispEmbBroadcastCount(id); emitDispConstant(id); break; @@ -13351,7 +13465,7 @@ void emitter::emitDispIns( printf("%s", emitRegName(id->idReg1(), attr)); printf(", %s", sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); printf(", %s", emitRegName(id->idReg2(), attr)); break; } @@ -13360,7 +13474,7 @@ void emitter::emitDispIns( emitDispEmbMasking(id); printf(", %s, %s", emitRegName(id->idReg2(), attr), sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + 
id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); emitDispEmbBroadcastCount(id); break; } @@ -13371,7 +13485,7 @@ void emitter::emitDispIns( emitDispEmbMasking(id); printf(", %s, %s", emitRegName(id->idReg2(), attr), sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); emitDispEmbBroadcastCount(id); emitDispConstant(id); break; @@ -13394,7 +13508,7 @@ void emitter::emitDispIns( printf(", %s, %s", emitRegName(id->idReg2(), attr), sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); emitDispEmbBroadcastCount(id); if (!hasMaskReg) @@ -13413,7 +13527,7 @@ void emitter::emitDispIns( printf(", %s", sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); emitDispEmbBroadcastCount(id); printf(", %s", emitRegName(id->idReg2(), attr)); break; @@ -13423,7 +13537,7 @@ void emitter::emitDispIns( { printf("%s", sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), - id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + id->idDebugOnlyInfo()->idVarRefOffs, asmfm, ins); emitDispEmbMasking(id); printf(", %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr)); break; @@ -14121,7 +14235,7 @@ void emitter::emitDispIns( assert(id->idInsFmt() == IF_SWR_LABEL); instrDescLbl* idlbl = (instrDescLbl*)id; - emitDispFrameRef(idlbl->dstLclVar.lvaVarNum(), idlbl->dstLclVar.lvaOffset(), 0, asmfm); + emitDispFrameRef(idlbl->dstLclVar.lvaVarNum(), idlbl->dstLclVar.lvaOffset(), 0, asmfm, ins); printf(", "); } @@ -14195,6 +14309,26 @@ void emitter::emitDispIns( } #endif +#if defined(TARGET_AMD64) + if 
(emitDispSecondFramePtrPending) + { + // The memory operand was emitted as [REG_OPT_RSVD2 + disp8]; show the canonical frame + // reference it stands in for as a trailing comment. + printf(" ; %s", emitDispSecondFramePtrFPbased ? STR_FPBASE : STR_SPBASE); + + if (emitDispSecondFramePtrAddr < 0) + { + printf("-0x%02X", -emitDispSecondFramePtrAddr); + } + else if (emitDispSecondFramePtrAddr > 0) + { + printf("+0x%02X", emitDispSecondFramePtrAddr); + } + + emitDispSecondFramePtrPending = false; + } +#endif // TARGET_AMD64 + printf("\n"); } @@ -15778,98 +15912,111 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // for stack variables the dsp should never be a reloc assert(id->idIsDspReloc() == 0); - if (EBPbased) +#if defined(TARGET_AMD64) + int secondDsp; + if (emitSecondFramePtrActive && emitIsSecondFramePtrCandidate(ins, EBPbased, dsp, &secondDsp)) { - // EBP-based variable: does the offset fit in a byte? - if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) + // Redirect through the secondary frame pointer (a low callee-saved register, e.g. RBX): + // modrm mod=01, rm=base => low byte 0x40 | (reg & 7); no SIB byte and no REX.B since the + // register is < R8 and is not RSP/R12. + assert((unsigned)REG_OPT_RSVD2 < 8); + dst += emitOutputWord(dst, code | (((0x40 | (REG_OPT_RSVD2 & 0x07))) << 8)); + dst += emitOutputByte(dst, secondDsp); + } + else +#endif // TARGET_AMD64 + if (EBPbased) { - if (dspInByte) + // EBP-based variable: does the offset fit in a byte? 
+ if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) + { + if (dspInByte) + { + dst += emitOutputByte(dst, code | 0x45); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputByte(dst, code | 0x85); + dst += emitOutputLong(dst, dsp); + } + } + else if (dspInByte) { - dst += emitOutputByte(dst, code | 0x45); + dst += emitOutputWord(dst, code | 0x4500); dst += emitOutputByte(dst, dsp); } else { - dst += emitOutputByte(dst, code | 0x85); + dst += emitOutputWord(dst, code | 0x8500); dst += emitOutputLong(dst, dsp); } } - else if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4500); - dst += emitOutputByte(dst, dsp); - } else { - dst += emitOutputWord(dst, code | 0x8500); - dst += emitOutputLong(dst, dsp); - } - } - else - { #if !FEATURE_FIXED_OUT_ARGS - // Adjust the offset by the amount currently pushed on the CPU stack - dsp += emitCurStackLvl; + // Adjust the offset by the amount currently pushed on the CPU stack + dsp += emitCurStackLvl; - if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) - { - // We cannot reliably predict the encoding size up front so we shouldn't - // have encountered a scenario marked with compressed displacement. We - // did predict cases that could use the small encoding for VEX scenarios + if (IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) + { + // We cannot reliably predict the encoding size up front so we shouldn't + // have encountered a scenario marked with compressed displacement. 
We + // did predict cases that could use the small encoding for VEX scenarios - assert(!HasCompressedDisplacement(id)); + assert(!HasCompressedDisplacement(id)); - if (!TakesEvexPrefix(id)) + if (!TakesEvexPrefix(id)) + { + dspInByte = ((signed char)dsp == (ssize_t)dsp); + } + } + else { dspInByte = ((signed char)dsp == (ssize_t)dsp); } - } - else - { - dspInByte = ((signed char)dsp == (ssize_t)dsp); - } - dspIsZero = (dsp == 0); + dspIsZero = (dsp == 0); #endif // !FEATURE_FIXED_OUT_ARGS - // Does the offset fit in a byte? - if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) - { - if (dspIsZero) + // Does the offset fit in a byte? + if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { - dst += emitOutputByte(dst, code | 0x04); + if (dspIsZero) + { + dst += emitOutputByte(dst, code | 0x04); + dst += emitOutputByte(dst, 0x24); + } + else if (dspInByte) + { + dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputByte(dst, code | 0x84); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputLong(dst, dsp); + } + } + else if (dspIsZero) + { + dst += emitOutputWord(dst, code | 0x0400); dst += emitOutputByte(dst, 0x24); } else if (dspInByte) { - dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputWord(dst, code | 0x4400); dst += emitOutputByte(dst, 0x24); dst += emitOutputByte(dst, dsp); } else { - dst += emitOutputByte(dst, code | 0x84); + dst += emitOutputWord(dst, code | 0x8400); dst += emitOutputByte(dst, 0x24); dst += emitOutputLong(dst, dsp); } } - else if (dspIsZero) - { - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, 0x24); - } - else if (dspInByte) - { - dst += emitOutputWord(dst, code | 0x4400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputByte(dst, dsp); - } - else - { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputLong(dst, dsp); - } - } // Now generate the constant 
value, if present if (addc) diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index db536ab58ce677..40bc2ef0f43378 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -65,6 +65,12 @@ struct CnsVal UNATIVE_OFFSET emitInsSize(instrDesc* id, code_t code, bool includeRexPrefixSize); UNATIVE_OFFSET emitInsSizeSVCalcDisp(instrDesc* id, code_t code, int var, int dsp); +#if defined(TARGET_AMD64) +// Set once per method in emitBegFN: true when a secondary frame pointer is reserved at that point (genSecondFramePtrReg != REG_NA). Lets the +// per-stack-access fast path skip the candidate check (and its codeGen deref) in the common case. NOTE(review): this change does not clear the flag when genFinalizeFrame cancels the reservation; emitIsSecondFramePtrCandidate re-checks genSecondFramePtrReg, so a stale true only costs the extra call -- confirm the ordering. +bool emitSecondFramePtrActive; +bool emitIsSecondFramePtrCandidate(instruction ins, bool EBPbased, int dsp, int* pAdjustedDsp); +#endif // TARGET_AMD64 UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp); UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val); UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code); @@ -814,6 +820,15 @@ void emitDispReloc(ssize_t value) const; void emitDispAddrMode(instrDesc* id, bool noDetail = false) const; void emitDispShift(instruction ins, int cnt = 0) const; +#if defined(TARGET_AMD64) +// Display state for secondary frame-pointer redirects (see emitDispFrameRef): the operand shows +// [rbx+disp8] and the canonical frame reference is emitted as a trailing comment, finalized at the +// end of the instruction line.
+bool emitDispSecondFramePtrPending = false; +bool emitDispSecondFramePtrFPbased = false; +int emitDispSecondFramePtrAddr = 0; +#endif // TARGET_AMD64 + const char* emitXMMregName(unsigned reg) const; const char* emitYMMregName(unsigned reg) const; const char* emitZMMregName(unsigned reg) const; diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 6e36a55a67ae02..a768eda8066580 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -699,6 +699,12 @@ RELEASE_CONFIG_INTEGER(JitInlinePolicyProfile, "JitInlinePolicyProfile", 0) RELEASE_CONFIG_INTEGER(JitInlinePolicyProfileThreshold, "JitInlinePolicyProfileThreshold", 40) CONFIG_STRING(JitObjectStackAllocationRange, "JitObjectStackAllocationRange") RELEASE_CONFIG_INTEGER(JitObjectStackAllocation, "JitObjectStackAllocation", 1) + +// When non-zero, reserve a callee-saved register as a secondary stack base pointer, offset by this +// many bytes from the primary base, to address far locals with a disp8 displacement. 0 disables. +// 256 is canonical: it tiles the two disp8 windows contiguously for a 512-byte cheap range; other +// values overlap (less reach) or leave a gap. x64 only. 
+RELEASE_CONFIG_INTEGER(JitSecondFramePtr, "JitSecondFramePtr", 0x100) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationRefClass, "JitObjectStackAllocationRefClass", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationBoxedValueClass, "JitObjectStackAllocationBoxedValueClass", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationConditionalEscape, "JitObjectStackAllocationConditionalEscape", 1) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 3c8a99a354bae8..411854a6cf84e9 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -2588,6 +2588,70 @@ void LinearScan::setFrameType() } #endif // TARGET_ARMARCH || TARGET_RISCV64 +#if defined(TARGET_AMD64) + // Consider reserving a callee-saved register as a secondary stack base pointer holding + // (primaryBase +/- offset), so far locals in a large frame can use cheap disp8 addressing + // (see JitSecondFramePtr). + { + const int secondFramePtrOffset = (int)JitConfig.JitSecondFramePtr(); + + // Only a win with optimizations disabled: with opts on, enregistration makes the pre-layout + // frame estimate a poor predictor and removing a register from allocation raises spill pressure. + const bool optDisabled = m_compiler->opts.OptimizationDisabled(); + + // We need a single base register (RBP or RSP) that is fixed for the whole method body. Other + // frame types (e.g. double-aligned) are not handled. + const bool haveFixedBase = (frameType == FT_EBP_FRAME) || (frameType == FT_ESP_FRAME); + + // With EH we require an RBP frame: filter funclets re-establish the pointer from RBP (the + // establisher frame pointer), so an RSP base would not be recoverable inside a funclet. + const bool ehCompatible = (m_compiler->compHndBBtabCount == 0) || (frameType == FT_EBP_FRAME); + + // OSR reuses the Tier0 frame with a bespoke callee-save/SP setup the secondary-pointer prolog + // does not handle. 
OSR is normally optimized, so optDisabled already excludes it; this guard only + // matters under stress modes that force MinOpts on an OSR method. + const bool notOsr = !m_compiler->opts.IsOSR(); + + // The frame must be large enough that some local can fall outside the primary disp8 window. + // x64 has no REGALLOC-time layout to consult, so rather than run a full lvaFrameSize pass just to + // gate this (opt-disabled-only) reservation, estimate the local area cheaply: locals sit above the + // outgoing-arg space, so sum that plus the stack-home sizes until a fixed 256-byte threshold is exceeded (a heuristic constant, independent of the configured JitSecondFramePtr offset). The estimate + // need not be exact (it omits temps/callee-saves): genSecondFramePtrIsProfitable re-checks with FINAL + // offsets and cancels the reservation if no local lands in the secondary band. + auto frameLikelyLargeEnough = [this]() -> bool { + unsigned size = m_compiler->lvaOutgoingArgSpaceSize; + for (unsigned lclNum = 0; lclNum < m_compiler->lvaCount; lclNum++) + { + size += m_compiler->lvaLclStackHomeSize(lclNum); + if (size > 256) + { + return true; + } + } + return false; + }; + + if ((secondFramePtrOffset != 0) && optDisabled && haveFixedBase && ehCompatible && notOsr && + frameLikelyLargeEnough()) + { + // Reserve the register only as a candidate: remove it from allocation now, but defer the + // real decision -- does any local land in the secondary disp8 band -- to genFinalizeFrame + // (genSecondFramePtrIsProfitable), which can still cancel the reservation. The REGALLOC-layout + // offsets available here are unreliable (base-register flag not yet set, offsets inflated by + // an over-estimated callee-save area) so cannot drive the band test. NOTE(review): the comment above says x64 has no REGALLOC-time layout; reconcile the two statements.
+ const bool wantFPbased = (frameType == FT_EBP_FRAME); + m_compiler->codeGen->genSecondFramePtrReg = REG_OPT_RSVD2; + m_compiler->codeGen->genSecondFramePtrOffset = secondFramePtrOffset; + m_compiler->codeGen->genSecondFramePtrFPbased = wantFPbased; + m_compiler->codeGen->regSet.rsMaskResvd |= RBM_OPT_RSVD2; + removeMask |= RBM_OPT_RSVD2.GetIntRegSet(); + JITDUMP(" Reserved REG_OPT_RSVD2 (%s) as candidate secondary frame pointer (%s%s%d)\n", + getRegName(REG_OPT_RSVD2), wantFPbased ? "RBP" : "RSP", wantFPbased ? "-" : "+", + secondFramePtrOffset); + } + } +#endif // TARGET_AMD64 + #ifdef TARGET_ARM if (m_compiler->compLocallocUsed) { diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 55c0dae697d7b6..309d58b1e8c276 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -417,6 +417,11 @@ #define RBM_SPBASE RBM_ESP #define STR_SPBASE "rsp" +// Secondary stack base pointer register (see JitSecondFramePtr). A low callee-saved register +// (RBX) avoids REX.B / SIB encoding complications. +#define REG_OPT_RSVD2 REG_EBX +#define RBM_OPT_RSVD2 RBM_EBX + #define FIRST_ARG_STACK_OFFS (REGSIZE_BYTES) // return address #ifdef UNIX_AMD64_ABI