diff --git a/CHANGELOG.md b/CHANGELOG.md
index f7cfa4e..7301fbe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,7 @@
 ## Pending
 
 ### Breaking changes
-- #12 Make the `ToBitsGadget` impl for `FpVar` output fixed-size
+- #12 Make the output of the `ToBitsGadget` impl for `FpVar` fixed-size
 
 ### Features
 
@@ -17,7 +17,7 @@
 - #33 Speedup scalar multiplication by a constant
 - #35 Construct a `FpVar` from bits
 - #36 Implement `ToConstraintFieldGadget` for `Vec<Uint8>`
-- #40 Faster scalar multiplication for Short Weierstrass curves by relying on affine formulae
+- #40, #43 Faster scalar multiplication for Short Weierstrass curves by relying on affine formulae
 
 ### Bug fixes
 - #8 Fix bug in `three_bit_cond_neg_lookup` when using a constant lookup bit
diff --git a/src/groups/curves/short_weierstrass/mod.rs b/src/groups/curves/short_weierstrass/mod.rs
index 47afc1f..cebaf30 100644
--- a/src/groups/curves/short_weierstrass/mod.rs
+++ b/src/groups/curves/short_weierstrass/mod.rs
@@ -2,7 +2,7 @@ use ark_ec::{
     short_weierstrass_jacobian::{GroupAffine as SWAffine, GroupProjective as SWProjective},
     AffineCurve, ProjectiveCurve, SWModelParameters,
 };
-use ark_ff::{BigInteger, BitIteratorBE, Field, FpParameters, One, PrimeField, Zero};
+use ark_ff::{BigInteger, BitIteratorBE, Field, One, PrimeField, Zero};
 use ark_relations::r1cs::{ConstraintSystemRef, Namespace, SynthesisError};
 use core::{borrow::Borrow, marker::PhantomData};
 use non_zero_affine::NonZeroAffineVar;
@@ -276,7 +276,7 @@ where
         multiple_of_power_of_two: &mut NonZeroAffineVar<P, F>,
         bits: &[&Boolean<<P::BaseField as Field>::BasePrimeField>],
     ) -> Result<(), SynthesisError> {
-        let scalar_modulus_bits = <<P::ScalarField as PrimeField>::Params>::MODULUS_BITS as usize;
+        let scalar_modulus_bits = <P::ScalarField as PrimeField>::size_in_bits();
 
         assert!(scalar_modulus_bits >= bits.len());
         let split_len = ark_std::cmp::min(scalar_modulus_bits - 2, bits.len());
@@ -317,8 +317,14 @@ where
 
         // As mentioned, we will skip the LSB, and will later handle it via a conditional subtraction.
         for bit in affine_bits.iter().skip(1) {
-            let temp = accumulator.add_unchecked(&multiple_of_power_of_two)?;
-            accumulator = bit.select(&temp, &accumulator)?;
+            if bit.is_constant() {
+                if *bit == &Boolean::TRUE {
+                    accumulator = accumulator.add_unchecked(&multiple_of_power_of_two)?;
+                }
+            } else {
+                let temp = accumulator.add_unchecked(&multiple_of_power_of_two)?;
+                accumulator = bit.select(&temp, &accumulator)?;
+            }
             multiple_of_power_of_two.double_in_place()?;
         }
         // Perform conditional subtraction:
@@ -332,8 +338,14 @@ where
 
         // Now, let's finish off the rest of the bits using our complete formulae
         for bit in proj_bits {
-            let temp = &*mul_result + &multiple_of_power_of_two.into_projective();
-            *mul_result = bit.select(&temp, &mul_result)?;
+            if bit.is_constant() {
+                if *bit == &Boolean::TRUE {
+                    *mul_result += &multiple_of_power_of_two.into_projective();
+                }
+            } else {
+                let temp = &*mul_result + &multiple_of_power_of_two.into_projective();
+                *mul_result = bit.select(&temp, &mul_result)?;
+            }
             multiple_of_power_of_two.double_in_place()?;
         }
         Ok(())
@@ -485,11 +497,23 @@ where
         // will conditionally select zero if `self` was zero.
         let non_zero_self = NonZeroAffineVar::new(x, y);
 
-        let bits = bits.collect::<Vec<_>>();
+        let mut bits = bits.collect::<Vec<_>>();
         if bits.len() == 0 {
             return Ok(Self::zero());
         }
-        let scalar_modulus_bits = <<P::ScalarField as PrimeField>::Params>::MODULUS_BITS as usize;
+        // Remove unnecessary constant zeros in the most-significant positions.
+        bits = bits
+            .into_iter()
+            // We iterate from the MSB down.
+            .rev()
+            // Skip leading zeros, if they are constants.
+            .skip_while(|b| b.is_constant() && (b.value().unwrap() == false))
+            .collect();
+        // After collecting we are in big-endian form; we have to reverse to get back to
+        // little-endian.
+        bits.reverse();
+
+        let scalar_modulus_bits = <P::ScalarField as PrimeField>::size_in_bits();
         let mut mul_result = Self::zero();
         let mut power_of_two_times_self = non_zero_self;
         // We chunk up `bits` into `p`-sized chunks.
@@ -497,7 +521,7 @@ where
             self.fixed_scalar_mul_le(&mut mul_result, &mut power_of_two_times_self, bits)?;
         }
 
-        // The foregoing algorithms rely on mixed/incomplete addition, and so do not
+        // The foregoing algorithm relies on incomplete addition, and so does not
         // work when the input (`self`) is zero. We hence have to perform
         // a check to ensure that if the input is zero, then so is the output.
         // The cost of this check should be less than the benefit of using
diff --git a/src/groups/curves/short_weierstrass/non_zero_affine.rs b/src/groups/curves/short_weierstrass/non_zero_affine.rs
index 2a69582..a598bdd 100644
--- a/src/groups/curves/short_weierstrass/non_zero_affine.rs
+++ b/src/groups/curves/short_weierstrass/non_zero_affine.rs
@@ -74,7 +74,7 @@ where
             let (x1, y1) = (&self.x, &self.y);
             let x1_sqr = x1.square()?;
             // Then,
-            // tangent lambda := (3 * x1^2 + a) / y1·;
+            // tangent lambda := (3 * x1^2 + a) / (2 * y1);
             // x3 = lambda^2 - 2x1
             // y3 = lambda * (x1 - x3) - y1
             let numerator = x1_sqr.double()? + &x1_sqr + P::COEFF_A;
@@ -86,6 +86,37 @@ where
         }
     }
 
+    /// Computes `(self + other) + self`. This method requires only 5 constraints,
+    /// less than the 7 required when computing via `self.double() + other`.
+    ///
+    /// This follows the formulae from [\[ELM03\]](https://arxiv.org/abs/math/0208038).
+    #[tracing::instrument(target = "r1cs", skip(self))]
+    pub(crate) fn double_and_add(&self, other: &Self) -> Result<Self, SynthesisError> {
+        if [self].is_constant() || other.is_constant() {
+            self.double()?.add_unchecked(other)
+        } else {
+            let (x1, y1) = (&self.x, &self.y);
+            let (x2, y2) = (&other.x, &other.y);
+
+            // Calculate self + other:
+            // slope lambda := (y2 - y1)/(x2 - x1);
+            // x3 = lambda^2 - x1 - x2;
+            // y3 = lambda * (x1 - x3) - y1
+            let numerator = y2 - y1;
+            let denominator = x2 - x1;
+            let lambda_1 = numerator.mul_by_inverse(&denominator)?;
+
+            let x3 = lambda_1.square()? - x1 - x2;
+
+            // Calculate final addition slope:
+            let lambda_2 = (lambda_1 + y1.double()?.mul_by_inverse(&(&x3 - x1))?).negate()?;
+
+            let x4 = lambda_2.square()? - x1 - x3;
+            let y4 = lambda_2 * &(x1 - &x4) - y1;
+            Ok(Self::new(x4, y4))
+        }
+    }
+
     /// Doubles `self` in place.
     #[tracing::instrument(target = "r1cs", skip(self))]
     pub(crate) fn double_in_place(&mut self) -> Result<(), SynthesisError> {