Speed up MSMs without GPU acceleration and on architectures that don't support GPU/semolina (#126)

* WASM target support * fast multiexp for WASM * add parallelisation for MSM https://github.com/zcash/halo2/blob/main/halo2_proofs/src/arithmetic.rs
2026-02-27 21:36:41 +01:00 · 2023-01-27 09:51:08 -08:00
parent d35604fe15
commit 6c6a8746d6
2 changed files with 155 additions and 14 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,7 +23,6 @@ rand_chacha = "0.3"
 itertools = "0.9.0"
 subtle = "2.4"
 pasta_curves = { version = "0.4.0", features = ["repr-c"] }
 pasta-msm = "0.1.3"
 neptune = { version = "8.1.0", default-features = false }
 generic-array = "0.14.4"
 num-bigint = { version = "0.4", features = ["serde", "rand"] }
@@ -47,4 +46,7 @@ name = "compressed-snark"
 harness = false
 [features]
-default = [ "bellperson/default", "neptune/default" ]
+default = ["bellperson/default", "neptune/default"]
 [target.'cfg(any(target_arch = "x86_64", target_arch = "aarch64"))'.dependencies]
 pasta-msm = "0.1.3"
--- a/src/pasta.rs
+++ b/src/pasta.rs
@@ -16,9 +16,138 @@ use pasta_curves::{
 };
 use rand::SeedableRng;
 use rand_chacha::ChaCha20Rng;
 use rayon::prelude::*;
 use sha3::Shake256;
-use std::{io::Read, ops::Mul};
+use std::io::Read;
 //////////////////////////////////////Shared MSM code for Pasta curves///////////////////////////////////////////////
 /// Single-threaded bucket-method multi-scalar multiplication, used on platforms
 /// where pasta_msm/semolina is unavailable. Forked from zcash/halo2.
 ///
 /// Each scalar is split into `c`-bit windows (with `c` derived from `bases.len()`),
 /// and the windows are processed from most to least significant. Terms are paired
 /// positionally via `zip`, so surplus entries in the longer slice are ignored.
 ///
 /// `acc` is doubled `c` times per window before the window's contribution is added,
 /// so it should hold the identity on entry for the result to equal
 /// `sum(coeffs[i] * bases[i])`.
 fn cpu_multiexp_serial<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C], acc: &mut C::Curve) {
  use ff::PrimeField;

  // Partial sum of all bases that share one window value. It starts empty, holds
  // the first base in affine form, and switches to projective once a second base
  // is added.
  #[derive(Clone, Copy)]
  enum Bucket<C: CurveAffine> {
    None,
    Affine(C),
    Projective(C::Curve),
  }

  impl<C: CurveAffine> Bucket<C> {
    // Adds the affine point `point` into this bucket.
    fn absorb(&mut self, point: &C) {
      *self = match *self {
        Bucket::None => Bucket::Affine(*point),
        Bucket::Affine(held) => Bucket::Projective(held + *point),
        Bucket::Projective(mut held) => {
          held += *point;
          Bucket::Projective(held)
        }
      }
    }

    // Returns `total` plus the content of this bucket.
    fn merge_into(self, mut total: C::Curve) -> C::Curve {
      match self {
        Bucket::None => total,
        Bucket::Affine(held) => {
          total += held;
          total
        }
        Bucket::Projective(held) => total + held,
      }
    }
  }

  // Extracts the `c`-bit window number `segment` (counted from the least
  // significant bit) out of the little-endian byte representation `bytes`.
  // Windows that start at or beyond byte 32 are reported as zero.
  fn window_at<F: PrimeField>(segment: usize, c: usize, bytes: &F::Repr) -> usize {
    let bit_offset = segment * c;
    let byte_offset = bit_offset / 8;
    if byte_offset >= 32 {
      return 0;
    }

    // Copy up to eight bytes starting at the window's first byte; bytes past
    // the end of the representation stay zero.
    let mut raw = [0; 8];
    for (dst, src) in raw.iter_mut().zip(bytes.as_ref()[byte_offset..].iter()) {
      *dst = *src;
    }

    let mut word = u64::from_le_bytes(raw);
    word >>= bit_offset - (byte_offset * 8);
    word %= 1 << c;
    word as usize
  }

  let reprs: Vec<_> = coeffs.iter().map(|scalar| scalar.to_repr()).collect();

  // Window width in bits, chosen from the number of bases.
  let c = match bases.len() {
    0..=3 => 1,
    4..=31 => 3,
    n => (f64::from(n as u32)).ln().ceil() as usize,
  };

  let segments = (256 / c) + 1;

  for current_segment in (0..segments).rev() {
    // Shift the accumulated result up by one window.
    for _ in 0..c {
      *acc = acc.double();
    }

    // One bucket per non-zero window value; value `d` lives at index `d - 1`.
    let mut buckets: Vec<Bucket<C>> = vec![Bucket::None; (1 << c) - 1];

    for (repr, base) in reprs.iter().zip(bases.iter()) {
      let digit = window_at::<C::Scalar>(current_segment, c, repr);
      if digit != 0 {
        buckets[digit - 1].absorb(base);
      }
    }

    // Summation by parts
    // e.g. 3a + 2b + 1c = a +
    //                    (a) + b +
    //                    ((a) + b) + c
    let mut running_sum = C::Curve::identity();
    for bucket in buckets.into_iter().rev() {
      running_sum = bucket.merge_into(running_sum);
      *acc += &running_sum;
    }
  }
 }
 /// Computes the multi-exponentiation of `coeffs` and `bases` on the CPU, without
 /// GPU acceleration. Forked from zcash/halo2.
 ///
 /// When there are more terms than rayon worker threads, the inputs are cut into
 /// contiguous chunks of `coeffs.len() / num_threads` terms, each chunk is handed
 /// to `cpu_multiexp_serial` on a rayon scope, and the partial results are summed.
 /// Otherwise the whole input is processed serially on the calling thread.
 ///
 /// # Panics
 ///
 /// Panics if `coeffs` and `bases` have different lengths.
 fn cpu_best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
  assert_eq!(coeffs.len(), bases.len());

  let workers = rayon::current_num_threads();

  // Too few terms to be worth splitting: run serially.
  if coeffs.len() <= workers {
    let mut total = C::Curve::identity();
    cpu_multiexp_serial(coeffs, bases, &mut total);
    return total;
  }

  // `coeffs.len() > workers` guarantees a chunk length of at least one.
  let chunk_len = coeffs.len() / workers;
  let mut partials = vec![C::Curve::identity(); coeffs.chunks(chunk_len).len()];

  rayon::scope(|scope| {
    let jobs = coeffs
      .chunks(chunk_len)
      .zip(bases.chunks(chunk_len))
      .zip(partials.iter_mut());
    for ((coeff_chunk, base_chunk), partial) in jobs {
      scope.spawn(move |_| {
        cpu_multiexp_serial(coeff_chunk, base_chunk, partial);
      });
    }
  });

  partials
    .iter()
    .fold(C::Curve::identity(), |sum, partial| sum + partial)
 }
 //////////////////////////////////////Pallas///////////////////////////////////////////////
@@ -43,6 +172,7 @@ impl Group for pallas::Point {
  type RO = PoseidonRO<Self::Base, Self::Scalar>;
  type ROCircuit = PoseidonROCircuit<Self::Base>;
  #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
  fn vartime_multiscalar_mul(
    scalars: &[Self::Scalar],
    bases: &[Self::PreprocessedGroupElement],
@@ -50,14 +180,18 @@ impl Group for pallas::Point {
    if scalars.len() >= 128 {
      pasta_msm::pallas(bases, scalars)
    } else {
-      scalars
+      cpu_best_multiexp(scalars, bases)
        .par_iter()
        .zip(bases)
        .map(|(scalar, base)| base.mul(scalar))
        .reduce(Ep::group_zero, |x, y| x + y)
    }
  }
  #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
  fn vartime_multiscalar_mul(
    scalars: &[Self::Scalar],
    bases: &[Self::PreprocessedGroupElement],
  ) -> Self {
    cpu_best_multiexp(scalars, bases)
  }
  fn preprocessed(&self) -> Self::PreprocessedGroupElement {
    self.to_affine()
  }
@@ -153,6 +287,7 @@ impl Group for vesta::Point {
  type RO = PoseidonRO<Self::Base, Self::Scalar>;
  type ROCircuit = PoseidonROCircuit<Self::Base>;
  #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
  fn vartime_multiscalar_mul(
    scalars: &[Self::Scalar],
    bases: &[Self::PreprocessedGroupElement],
@@ -160,14 +295,18 @@ impl Group for vesta::Point {
    if scalars.len() >= 128 {
      pasta_msm::vesta(bases, scalars)
    } else {
-      scalars
+      cpu_best_multiexp(scalars, bases)
        .par_iter()
        .zip(bases)
        .map(|(scalar, base)| base.mul(scalar))
        .reduce(Eq::group_zero, |x, y| x + y)
    }
  }
  #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
  fn vartime_multiscalar_mul(
    scalars: &[Self::Scalar],
    bases: &[Self::PreprocessedGroupElement],
  ) -> Self {
    cpu_best_multiexp(scalars, bases)
  }
  fn compress(&self) -> Self::CompressedGroupElement {
    VestaCompressedElementWrapper::new(self.to_bytes())
  }