From 8c5e37ce7ba23579c216b981485c1575e59882b2 Mon Sep 17 00:00:00 2001
From: Tony Arcieri <bascule@gmail.com>
Date: Wed, 17 Jun 2026 22:17:27 -0600
Subject: [PATCH] k256: w-NAF linear combinations / multiscalar multiplications
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

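Wires up `WnafBase::multiscalar_mul` as the backend for
`LinearCombination::lincomb_vartime` when the `alloc` feature is
enabled. Without `alloc`, the existing radix-16 implementation is kept
as the fallback.

`WnafBase::multiscalar_mul` now accepts arbitrary iterators of scalars
and bases; the previous const-generic array version is renamed to
`WnafBase::multiscalar_mul_array`.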

Provides a ~16% speedup for the 2-term case.

high-level operations/lincomb_vartime (2-term)
    time:   [29.102 µs 29.277 µs 29.465 µs]
    change: [−16.175% −15.781% −15.426%] (p = 0.00 < 0.05)
    Performance has improved.
---
 k256/benches/scalar.rs          | 12 +++++-
 k256/src/arithmetic/mul.rs      | 74 +++++++++++++++++++++++++--------
 k256/src/arithmetic/mul/wnaf.rs | 16 ++++++-
 3 files changed, 82 insertions(+), 20 deletions(-)
diff --git a/k256/benches/scalar.rs b/k256/benches/scalar.rs
index f265b9022..8fe27c92a 100644
--- a/k256/benches/scalar.rs
+++ b/k256/benches/scalar.rs
@@ -54,14 +54,22 @@ fn bench_point_lincomb<M: Measurement>(group: &mut BenchmarkGroup<'_, M>) {
     let p = ProjectivePoint::GENERATOR;
     let m = hex!("AA5E28D6A97A2479A65527F7290311A3624D4CC0FA1578598EE3C2613BF99522");
     let s = Scalar::from_repr(m.into()).unwrap();
-    group.bench_function("lincomb via mul+add", |b| {
+    group.bench_function("lincomb (unoptimized, 2-term)", |b| {
         b.iter(|| black_box(p) * black_box(s) + black_box(p) * black_box(s))
     });
-    group.bench_function("lincomb()", |b| {
+    group.bench_function("lincomb (optimized, 2-term)", |b| {
         b.iter(|| {
             ProjectivePoint::lincomb(&[(black_box(p), black_box(s)), (black_box(p), black_box(s))])
         })
     });
+    group.bench_function("lincomb_vartime (2-term)", |b| {
+        b.iter(|| {
+            ProjectivePoint::lincomb_vartime(&[
+                (black_box(p), black_box(s)),
+                (black_box(p), black_box(s)),
+            ])
+        })
+    });
 }
 
 fn bench_point_mul_by_generator<M: Measurement>(group: &mut BenchmarkGroup<'_, M>) {
diff --git a/k256/src/arithmetic/mul.rs b/k256/src/arithmetic/mul.rs
index 65b2a027f..a7f89f654 100644
--- a/k256/src/arithmetic/mul.rs
+++ b/k256/src/arithmetic/mul.rs
@@ -53,9 +53,9 @@ use primeorder::Radix16Decomposition;
 #[cfg(feature = "alloc")]
 use {
     self::wnaf::{WnafBase, WnafScalar},
+    alloc::vec::Vec,
     primeorder::PrimeFieldExt,
 };
-
 #[cfg(feature = "precomputed-tables")]
 use {super::tables::BASEPOINT_TABLE, elliptic_curve::array::sizes::U65};
 
@@ -224,10 +224,18 @@ impl<const N: usize> LinearCombination<[(ProjectivePoint, Scalar); N]> for Proje
     }
 
     fn lincomb_vartime(points_and_scalars: &[(ProjectivePoint, Scalar); N]) -> Self {
-        let mut tables = [(LookupTable::default(), LookupTable::default()); N];
-        let mut digits: [(Radix16Decomposition<U33>, Radix16Decomposition<U33>); N] =
-            array::from_fn(|_| Default::default());
-        lincomb_vartime(points_and_scalars, &mut tables, &mut digits)
+        #[cfg(not(feature = "alloc"))]
+        {
+            let mut tables = [(LookupTable::default(), LookupTable::default()); N];
+            let mut digits: [(Radix16Decomposition<U33>, Radix16Decomposition<U33>); N] =
+                array::from_fn(|_| Default::default());
+            lincomb_vartime(points_and_scalars, &mut tables, &mut digits)
+        }
+
+        #[cfg(feature = "alloc")]
+        {
+            lincomb_vartime(points_and_scalars)
+        }
     }
 }
 
@@ -249,17 +257,8 @@ impl LinearCombination<[(ProjectivePoint, Scalar)]> for ProjectivePoint {
 
     #[cfg(feature = "alloc")]
     fn lincomb_vartime(points_and_scalars: &[(ProjectivePoint, Scalar)]) -> Self {
-        let mut tables =
-            vec![(LookupTable::default(), LookupTable::default()); points_and_scalars.len()];
-        let mut digits = vec![
-            (
-                Radix16Decomposition::<U33>::default(),
-                Radix16Decomposition::<U33>::default(),
-            );
-            points_and_scalars.len()
-        ];
-
-        lincomb_vartime(points_and_scalars, &mut tables, &mut digits)
+        // This method only exists with `alloc`, so the w-NAF backend is always available here.
+        lincomb_vartime(points_and_scalars)
     }
 }
 
@@ -318,7 +334,12 @@ fn lincomb(
 }
 
 /// Linear combination (a.k.a. multiscalar multiplication) implemented in variable-time.
+///
+/// This implementation reuses the radix-16 decomposition from the constant-time implementation and
+/// has no `alloc` dependency but is slower than w-NAF, so it is only used when `alloc` isn't
+/// available.
 // TODO(tarcieri): fully eliminate constant-time constructions
+#[cfg(not(feature = "alloc"))]
 fn lincomb_vartime(
     xks: &[(ProjectivePoint, Scalar)],
     tables: &mut [(LookupTable, LookupTable)],
@@ -372,6 +393,25 @@ fn lincomb_vartime(
     acc
 }
 
+/// Variable-time linear combination (a.k.a. multiscalar multiplication) backed by w-NAF.
+///
+/// Fastest variable-time backend, but the intermediate vectors require `alloc`.
+#[cfg(feature = "alloc")]
+fn lincomb_vartime(xks: &[(ProjectivePoint, Scalar)]) -> ProjectivePoint {
+    // GLV decomposition yields two (base, scalar) pairs per input term.
+    let pair_count = xks.len() * 2;
+    let mut bases = Vec::<WnafBase<ProjectivePoint, WNAF_WINDOW>>::with_capacity(pair_count);
+    let mut scalars = Vec::<WnafScalar<Scalar, WNAF_WINDOW>>::with_capacity(pair_count);
+
+    for (point, scalar) in xks {
+        let (term_bases, term_scalars) = decompose_glv_wnaf(point, scalar);
+        bases.extend(term_bases);
+        scalars.extend(term_scalars);
+    }
+
+    WnafBase::multiscalar_mul(scalars, bases)
+}
+
 impl ProjectivePoint {
     /// Calculates `k * G`, where `G` is the generator.
     pub fn mul_by_generator(k: &Scalar) -> ProjectivePoint {
@@ -443,7 +483,7 @@ fn mul_vartime(x: &ProjectivePoint, k: &Scalar) -> ProjectivePoint {
 #[cfg(feature = "alloc")]
 fn mul_vartime(x: &ProjectivePoint, k: &Scalar) -> ProjectivePoint {
     let (bases, scalars) = decompose_glv_wnaf(x, k);
-    WnafBase::multiscalar_mul(&scalars, &bases)
+    WnafBase::multiscalar_mul_array(&scalars, &bases)
 }
 
 /// GLV-decompose `k` for `x`: two `(WnafBase, WnafScalar)` pairs representing `r1 * self_signed`
@@ -540,7 +580,7 @@ impl MulByGeneratorVartime for ProjectivePoint {
     fn mul_by_generator_and_mul_add_vartime(a: &Self::Scalar, b: &Self::Scalar, p: &Self) -> Self {
         let ([gb0, gb1], [gs0, gs1]) = decompose_glv_wnaf(&ProjectivePoint::GENERATOR, a);
         let ([pb0, pb1], [ps0, ps1]) = decompose_glv_wnaf(p, b);
-        WnafBase::multiscalar_mul(&[gs0, gs1, ps0, ps1], &[gb0, gb1, pb0, pb1])
+        WnafBase::multiscalar_mul_array(&[gs0, gs1, ps0, ps1], &[gb0, gb1, pb0, pb1])
     }
 }
 
diff --git a/k256/src/arithmetic/mul/wnaf.rs b/k256/src/arithmetic/mul/wnaf.rs
index 2ee4c460e..136a11ed1 100644
--- a/k256/src/arithmetic/mul/wnaf.rs
+++ b/k256/src/arithmetic/mul/wnaf.rs
@@ -64,8 +64,26 @@ impl<G: Group, const WINDOW_SIZE: usize> WnafBase<G, WINDOW_SIZE> {
     ///
     /// Computes a sum-of-products `aA + bB + ...` in variable time with w-NAF multi-exponentiation
     /// using the interleaved window method, also known as Straus' method.
+    ///
+    /// Accepts any iterables of scalars and bases; both are collected into `Vec`s before the
+    /// computation. See [`Self::multiscalar_mul_array`] for fixed-size array arguments.
+    #[must_use]
+    pub fn multiscalar_mul<I, J>(scalars: I, bases: J) -> G
+    where
+        I: IntoIterator<Item = WnafScalar<G::Scalar, WINDOW_SIZE>>,
+        J: IntoIterator<Item = Self>,
+    {
+        let wnafs = scalars.into_iter().map(|s| s.wnaf).collect::<Vec<_>>();
+        let tables = bases.into_iter().map(|b| b.table).collect::<Vec<_>>();
+        wnaf_multi_exp(tables.as_slice(), wnafs.as_slice())
+    }
+
+    /// Perform a multiscalar multiplication with fixed-size array arguments.
+    ///
+    /// Computes a sum-of-products `aA + bB + ...` in variable time with w-NAF multi-exponentiation
+    /// using the interleaved window method, also known as Straus' method.
     #[must_use]
-    pub fn multiscalar_mul<const N: usize>(
+    pub fn multiscalar_mul_array<const N: usize>(
         scalars: &[WnafScalar<G::Scalar, WINDOW_SIZE>; N],
         bases: &[Self; N],
     ) -> G {