From 6e5a99800986472ecf5233bf82ec5c032ce96ab5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Bylica?=
Date: Thu, 2 Apr 2026 18:42:29 +0200
Subject: [PATCH] crypto: Use local accumulator t[] in mul_amm_256 to avoid aliasing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use a local array t[4] instead of writing directly to the output span r,
which may alias the inputs x or y. This allows the compiler to keep the
accumulator in registers without reloading after stores. The result is
copied to r at the end.

~6% speedup on 256-bit modexp benchmarks (25846 → 24256 cycles).
---
 lib/evmone_precompiles/mulmod.cpp | 41 +++++++++++++++++--------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/lib/evmone_precompiles/mulmod.cpp b/lib/evmone_precompiles/mulmod.cpp
index c2af255ae0..5f729b85c7 100644
--- a/lib/evmone_precompiles/mulmod.cpp
+++ b/lib/evmone_precompiles/mulmod.cpp
@@ -10,40 +10,45 @@ void mul_amm_256(std::span r, std::span x,
     std::span y, std::span mod, uint64_t mod_inv) noexcept
 {
     static constexpr size_t N = 4;
-    const auto r_lo = r.subspan<0, 3>();
-    const auto r_hi = r.subspan<1>();
+
+    // Local accumulator t[] avoids aliasing penalties when r overlaps x or y.
+    std::array t;  // NOLINT(*-pro-type-member-init)
+    const auto t_lo = std::span{t}.subspan<0, N - 1>();
+    const auto t_hi = std::span{t}.subspan<1>();
     const auto mod_hi = mod.subspan<1>();
 
-    // First iteration: r is uninitialized, so use mul instead of addmul.
-    bool r_carry = false;
+    // First iteration: t is uninitialized, so use mul instead of addmul.
+    bool t_carry = false;
     {
-        const auto c1 = mul(r, x, y[0]);
+        const auto c1 = mul(t, x, y[0]);
 
-        const auto m = r[0] * mod_inv;
-        const auto c2 = (umul(mod[0], m) + r[0])[1];
+        const auto m = t[0] * mod_inv;
+        const auto c2 = (umul(mod[0], m) + t[0])[1];
 
-        const auto c3 = addmul(r_lo, r_hi, mod_hi, m, c2);
-        std::tie(r[N - 1], r_carry) = addc(c1, c3);
+        const auto c3 = addmul(t_lo, t_hi, mod_hi, m, c2);
+        std::tie(t[N - 1], t_carry) = addc(c1, c3);
     }
 
     // Remaining 3 iterations.
 #pragma GCC unroll N - 1
     for (size_t i = 1; i != N; ++i)
     {
-        const auto c1 = addmul(r, r, x, y[i]);
-        const auto [sum1, d1] = addc(c1, uint64_t{r_carry});
+        const auto c1 = addmul(t, t, x, y[i]);
+        const auto [sum1, d1] = addc(c1, uint64_t{t_carry});
 
-        const auto m = r[0] * mod_inv;
-        const auto c2 = (umul(mod[0], m) + r[0])[1];
+        const auto m = t[0] * mod_inv;
+        const auto c2 = (umul(mod[0], m) + t[0])[1];
 
-        const auto c3 = addmul(r_lo, r_hi, mod_hi, m, c2);
+        const auto c3 = addmul(t_lo, t_hi, mod_hi, m, c2);
         const auto [sum2, d2] = addc(sum1, c3);
-        r[N - 1] = sum2;
+        t[N - 1] = sum2;
         assert(!(d1 && d2));
-        r_carry = d1 || d2;
+        t_carry = d1 || d2;
     }
 
-    if (r_carry)
-        sub(r, mod);
+    if (t_carry)
+        sub(t, mod);
+
+    std::ranges::copy(t, r.begin());
 }
 }  // namespace evmone::crypto