Add multi-thread bdd eval

2026-02-10 13:16:44 +01:00 · 2025-11-12 11:02:37 +01:00
parent 6924ffd94a
commit 1423de1c46
22 changed files with 336 additions and 273 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -323,6 +323,12 @@ version = "11.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9"

+[[package]]
+name = "paste"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
+
 [[package]]
 name = "plotters"
 version = "0.3.7"
@@ -406,6 +412,7 @@ dependencies = [
 "byteorder",
 "criterion",
 "itertools 0.14.0",
+ "paste",
 "poulpy-backend",
 "poulpy-core",
 "poulpy-hal",
--- a/poulpy-hal/src/api/scratch.rs
+++ b/poulpy-hal/src/api/scratch.rs
@@ -30,12 +30,33 @@ pub trait TakeSlice {

 impl<BE: Backend> Scratch<BE>
 where
-    Self: TakeSlice + ScratchFromBytes<BE>,
+    Self: TakeSlice + ScratchAvailable + ScratchFromBytes<BE>,
 {
    pub fn split_at_mut(&mut self, len: usize) -> (&mut Scratch<BE>, &mut Self) {
        let (take_slice, rem_slice) = self.take_slice(len);
        (Self::from_bytes(take_slice), rem_slice)
    }
+
+    /// Carves `n` consecutive sub-scratches of `len` each off `self` by calling
+    /// `split_at_mut` `n` times.
+    ///
+    /// Returns the sub-scratches in carve order together with whatever is left of `self`.
+    ///
+    /// # Panics
+    /// Panics if `n * len` overflows `usize` or exceeds `self.available()`.
+    pub fn split_mut(&mut self, n: usize, len: usize) -> (Vec<&mut Scratch<BE>>, &mut Self) {
+        // Checked product: a wrapped `n * len` in release builds would let the guard pass.
+        let total: usize = n.checked_mul(len).expect("split_mut: n * len overflows usize");
+        assert!(self.available() >= total);
+        let mut scratches: Vec<&mut Scratch<BE>> = Vec::with_capacity(n);
+        let mut scratch: &mut Scratch<BE> = self;
+        for _ in 0..n {
+            let (tmp, scratch_new) = scratch.split_at_mut(len);
+            scratch = scratch_new;
+            scratches.push(tmp);
+        }
+        (scratches, scratch)
+    }
 }

 impl<B: Backend> ScratchTakeBasic for Scratch<B> where Self: TakeSlice + ScratchFromBytes<B> {}
--- a/poulpy-schemes/Cargo.toml
+++ b/poulpy-schemes/Cargo.toml
@@ -17,8 +17,8 @@ criterion = {workspace = true}
 itertools = "0.14.0"
 byteorder = "1.5.0"
 rand = "0.9.2"
-
+paste = "1.0.15"

 [[bench]]
-name = "fhe_uint_prepare"
+name = "circuit_bootstrapping"
 harness = false
--- a/poulpy-schemes/benches/fhe_uint_prepare.rs
+++ b/poulpy-schemes/benches/fhe_uint_prepare.rs
@@ -1,126 +0,0 @@
-use std::hint::black_box;
-
-use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
-use poulpy_backend::{FFT64Avx, FFT64Ref};
-use poulpy_core::{
-    GGSWNoise, GLWEDecrypt, GLWEEncryptSk, GLWENoise, ScratchTakeCore,
-    layouts::{GGSWLayout, GLWELayout, GLWESecretPreparedFactory, prepared::GLWESecretPrepared},
-};
-use poulpy_hal::{
-    api::{ModuleNew, ScratchOwnedAlloc, ScratchOwnedBorrow},
-    layouts::{Backend, Module, Scratch, ScratchOwned},
-    source::Source,
-};
-use rand::RngCore;
-
-use poulpy_schemes::tfhe::{
-    bdd_arithmetic::{
-        BDDKeyEncryptSk, BDDKeyPrepared, BDDKeyPreparedFactory, ExecuteBDDCircuit2WTo1W, FheUint, FheUintPrepare,
-        FheUintPrepareDebug, FheUintPrepared, FheUintPreparedEncryptSk, FheUintPreparedFactory,
-        tests::test_suite::TestContext,
-    },
-    blind_rotation::{BlindRotationAlgo, BlindRotationKey, BlindRotationKeyFactory, CGGI},
-};
-
-pub fn benc_bdd_prepare<BRA: BlindRotationAlgo, BE: Backend>(
-    c: &mut Criterion,
-    label: &str,
-    test_context: &TestContext<BRA, BE>,
-) where
-    Module<BE>: ModuleNew<BE>
-        + GLWESecretPreparedFactory<BE>
-        + GLWEDecrypt<BE>
-        + GLWENoise<BE>
-        + FheUintPreparedFactory<u32, BE>
-        + FheUintPreparedEncryptSk<u32, BE>
-        + FheUintPrepareDebug<BRA, u32, BE>
-        + BDDKeyEncryptSk<BRA, BE>
-        + BDDKeyPreparedFactory<BRA, BE>
-        + GGSWNoise<BE>
-        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
-        + GLWEEncryptSk<BE>,
-    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
-    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
-    Scratch<BE>: ScratchTakeCore<BE>,
-{
-    let group_name: String = format!("bdd_prepare::{label}");
-
-    let mut group = c.benchmark_group(group_name);
-
-    fn runner<BE: Backend, BRA: BlindRotationAlgo>(test_context: &TestContext<BRA, BE>) -> impl FnMut()
-    where
-        Module<BE>: ModuleNew<BE>
-            + GLWESecretPreparedFactory<BE>
-            + GLWEDecrypt<BE>
-            + GLWENoise<BE>
-            + FheUintPreparedFactory<u32, BE>
-            + FheUintPreparedEncryptSk<u32, BE>
-            + FheUintPrepareDebug<BRA, u32, BE>
-            + BDDKeyEncryptSk<BRA, BE>
-            + BDDKeyPreparedFactory<BRA, BE>
-            + GGSWNoise<BE>
-            + FheUintPrepare<BRA, BE>
-            + ExecuteBDDCircuit2WTo1W<u32, BE>
-            + GLWEEncryptSk<BE>,
-        BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
-        ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
-        Scratch<BE>: ScratchTakeCore<BE>,
-    {
-        let glwe_infos: GLWELayout = test_context.glwe_infos();
-        let ggsw_infos: GGSWLayout = test_context.ggsw_infos();
-
-        let module: &Module<BE> = &test_context.module;
-        let sk_glwe_prep: &GLWESecretPrepared<Vec<u8>, BE> = &test_context.sk_glwe;
-        let bdd_key_prepared: &BDDKeyPrepared<Vec<u8>, BRA, BE> = &test_context.bdd_key;
-
-        let mut source: Source = Source::new([6u8; 32]);
-
-        let mut source_xa: Source = Source::new([2u8; 32]);
-        let mut source_xe: Source = Source::new([3u8; 32]);
-
-        let threads = 1;
-
-        let mut scratch: ScratchOwned<BE> = ScratchOwned::alloc((1 << 22) * threads);
-
-        // GLWE(value)
-        let mut c_enc: FheUint<Vec<u8>, u32> = FheUint::alloc_from_infos(&glwe_infos);
-        let value: u32 = source.next_u32();
-        c_enc.encrypt_sk(
-            module,
-            value,
-            sk_glwe_prep,
-            &mut source_xa,
-            &mut source_xe,
-            scratch.borrow(),
-        );
-
-        // GGSW(0)
-        let mut c_enc_prep: FheUintPrepared<Vec<u8>, u32, BE> =
-            FheUintPrepared::<Vec<u8>, u32, BE>::alloc_from_infos(module, &ggsw_infos);
-
-        // GGSW(value)
-        move || {
-            c_enc_prep.prepare_custom_multi_thread(threads, module, &c_enc, 0, 32, bdd_key_prepared, scratch.borrow());
-            black_box(());
-        }
-    }
-
-    let id: BenchmarkId = BenchmarkId::from_parameter(format!("n_glwe: {}", test_context.module.n()));
-    let mut runner = runner::<BE, BRA>(test_context);
-    group.bench_with_input(id, &(), |b, _| b.iter(&mut runner));
-
-    group.finish();
-}
-
-fn bench_bdd_prepare_cpu_ref_fft64(c: &mut Criterion) {
-    benc_bdd_prepare::<CGGI, FFT64Avx>(
-        c,
-        "bdd_prepare_fft64_ref",
-        &TestContext::<CGGI, FFT64Avx>::new(),
-    );
-}
-
-criterion_group!(benches, bench_bdd_prepare_cpu_ref_fft64,);
-
-criterion_main!(benches);
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/bdd_2w_to_1w.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/bdd_2w_to_1w.rs
@@ -13,17 +13,14 @@ use crate::tfhe::bdd_arithmetic::{
    BitSize, ExecuteBDDCircuit, FheUint, FheUintPrepared, GetBitCircuitInfo, GetGGSWBit, UnsignedInteger, circuits,
 };

-impl<T: UnsignedInteger, BE: Backend> ExecuteBDDCircuit2WTo1W<T, BE> for Module<BE> where
-    Self: Sized + ExecuteBDDCircuit<T, BE> + GLWEPacking<BE> + GLWECopy
-{
-}
+impl<BE: Backend> ExecuteBDDCircuit2WTo1W<BE> for Module<BE> where Self: Sized + ExecuteBDDCircuit<BE> + GLWEPacking<BE> + GLWECopy
+{}

-pub trait ExecuteBDDCircuit2WTo1W<T: UnsignedInteger, BE: Backend>
+pub trait ExecuteBDDCircuit2WTo1W<BE: Backend>
 where
-    Self: Sized + ModuleLogN + ExecuteBDDCircuit<T, BE> + GLWEPacking<BE> + GLWECopy,
+    Self: Sized + ModuleLogN + ExecuteBDDCircuit<BE> + GLWEPacking<BE> + GLWECopy,
 {
-    /// Operations Z x Z -> Z
-    fn execute_bdd_circuit_2w_to_1w<R, C, A, B, K, H>(
+    fn execute_bdd_circuit_2w_to_1w<R, C, A, B, K, H, T>(
        &self,
        out: &mut FheUint<R, T>,
        circuit: &C,
@@ -32,6 +29,31 @@ where
        key: &H,
        scratch: &mut Scratch<BE>,
    ) where
+        T: UnsignedInteger,
+        C: GetBitCircuitInfo<T>,
+        R: DataMut,
+        A: DataRef,
+        B: DataRef,
+        K: GGLWEPreparedToRef<BE> + GetGaloisElement + GGLWEInfos,
+        H: GLWEAutomorphismKeyHelper<K, BE>,
+        Scratch<BE>: ScratchTakeCore<BE>,
+    {
+        self.execute_bdd_circuit_2w_to_1w_multi_thread(1, out, circuit, a, b, key, scratch);
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    /// Operations Z x Z -> Z
+    fn execute_bdd_circuit_2w_to_1w_multi_thread<R, C, A, B, K, H, T>(
+        &self,
+        threads: usize,
+        out: &mut FheUint<R, T>,
+        circuit: &C,
+        a: &FheUintPrepared<A, T, BE>,
+        b: &FheUintPrepared<B, T, BE>,
+        key: &H,
+        scratch: &mut Scratch<BE>,
+    ) where
+        T: UnsignedInteger,
        C: GetBitCircuitInfo<T>,
        R: DataMut,
        A: DataRef,
@@ -50,7 +72,7 @@ where
        let (mut out_bits, scratch_1) = scratch.take_glwe_slice(T::BITS as usize, out);

        // Evaluates out[i] = circuit[i](a, b)
-        self.execute_bdd_circuit(&mut out_bits, &helper, circuit, scratch_1);
+        self.execute_bdd_circuit_multi_thread(threads, &mut out_bits, &helper, circuit, scratch_1);

        // Repacks the bits
        out.pack(self, out_bits, key, scratch_1);
@@ -100,22 +122,43 @@ where
 #[macro_export]
 macro_rules! define_bdd_2w_to_1w_trait {
    ($(#[$meta:meta])* $vis:vis $trait_name:ident, $method_name:ident) => {
-        $(#[$meta])*
-        $vis trait $trait_name<T: UnsignedInteger, BE: Backend> {
-            fn $method_name<A, M, K, H, B>(
-                &mut self,
-                module: &M,
-                a: &FheUintPrepared<A, T, BE>,
-                b: &FheUintPrepared<B, T, BE>,
-                key: &H,
-                scratch: &mut Scratch<BE>,
-            ) where
-                M: ExecuteBDDCircuit2WTo1W<T, BE>,
-                A: DataRef,
-                B: DataRef,
-                K: GGLWEPreparedToRef<BE> + GetGaloisElement + GGLWEInfos,
-                H: GLWEAutomorphismKeyHelper<K, BE>,
-                Scratch<BE>: ScratchTakeCore<BE>;
+        paste::paste! {
+            $(#[$meta])*
+            $vis trait $trait_name<T: UnsignedInteger, BE: Backend> {
+
+                /// Single-threaded version
+                fn $method_name<A, M, K, H, B>(
+                    &mut self,
+                    module: &M,
+                    a: &FheUintPrepared<A, T, BE>,
+                    b: &FheUintPrepared<B, T, BE>,
+                    key: &H,
+                    scratch: &mut Scratch<BE>,
+                ) where
+                    M: ExecuteBDDCircuit2WTo1W<BE>,
+                    A: DataRef,
+                    B: DataRef,
+                    K: GGLWEPreparedToRef<BE> + GetGaloisElement + GGLWEInfos,
+                    H: GLWEAutomorphismKeyHelper<K, BE>,
+                    Scratch<BE>: ScratchTakeCore<BE>;
+
+                /// Multithreaded version – same vis, method_name + "_multi_thread"
+                fn [<$method_name _multi_thread>]<A, M, K, H, B>(
+                    &mut self,
+                    threads: usize,
+                    module: &M,
+                    a: &FheUintPrepared<A, T, BE>,
+                    b: &FheUintPrepared<B, T, BE>,
+                    key: &H,
+                    scratch: &mut Scratch<BE>,
+                ) where
+                    M: ExecuteBDDCircuit2WTo1W<BE>,
+                    A: DataRef,
+                    B: DataRef,
+                    K: GGLWEPreparedToRef<BE> + GetGaloisElement + GGLWEInfos,
+                    H: GLWEAutomorphismKeyHelper<K, BE>,
+                    Scratch<BE>: ScratchTakeCore<BE>;
+            }
        }
    };
 }
@@ -123,23 +166,45 @@ macro_rules! define_bdd_2w_to_1w_trait {
 #[macro_export]
 macro_rules! impl_bdd_2w_to_1w_trait {
    ($trait_name:ident, $method_name:ident, $ty:ty, $circuit_ty:ty, $output_circuits:path) => {
-        impl<D: DataMut, BE: Backend> $trait_name<$ty, BE> for FheUint<D, $ty> {
-            fn $method_name<A, M, K, H, B>(
-                &mut self,
-                module: &M,
-                a: &FheUintPrepared<A, $ty, BE>,
-                b: &FheUintPrepared<B, $ty, BE>,
-                key: &H,
-                scratch: &mut Scratch<BE>,
-            ) where
-                M: ExecuteBDDCircuit2WTo1W<$ty, BE>,
-                A: DataRef,
-                B: DataRef,
-                K: GGLWEPreparedToRef<BE> + GetGaloisElement + GGLWEInfos,
-                H: GLWEAutomorphismKeyHelper<K, BE>,
-                Scratch<BE>: ScratchTakeCore<BE>,
-            {
-                module.execute_bdd_circuit_2w_to_1w(self, &$output_circuits, a, b, key, scratch)
+        paste::paste! {
+            impl<D: DataMut, BE: Backend> $trait_name<$ty, BE> for FheUint<D, $ty> {
+
+                fn $method_name<A, M, K, H, B>(
+                    &mut self,
+                    module: &M,
+                    a: &FheUintPrepared<A, $ty, BE>,
+                    b: &FheUintPrepared<B, $ty, BE>,
+                    key: &H,
+                    scratch: &mut Scratch<BE>,
+                ) where
+                    M: ExecuteBDDCircuit2WTo1W<BE>,
+                    A: DataRef,
+                    B: DataRef,
+                    K: GGLWEPreparedToRef<BE> + GetGaloisElement + GGLWEInfos,
+                    H: GLWEAutomorphismKeyHelper<K, BE>,
+                    Scratch<BE>: ScratchTakeCore<BE>,
+                {
+                    module.execute_bdd_circuit_2w_to_1w(self, &$output_circuits, a, b, key, scratch)
+                }
+
+                fn [<$method_name _multi_thread>]<A, M, K, H, B>(
+                    &mut self,
+                    threads: usize,
+                    module: &M,
+                    a: &FheUintPrepared<A, $ty, BE>,
+                    b: &FheUintPrepared<B, $ty, BE>,
+                    key: &H,
+                    scratch: &mut Scratch<BE>,
+                ) where
+                    M: ExecuteBDDCircuit2WTo1W<BE>,
+                    A: DataRef,
+                    B: DataRef,
+                    K: GGLWEPreparedToRef<BE> + GetGaloisElement + GGLWEInfos,
+                    H: GLWEAutomorphismKeyHelper<K, BE>,
+                    Scratch<BE>: ScratchTakeCore<BE>,
+                {
+                    module.execute_bdd_circuit_2w_to_1w_multi_thread(threads, self, &$output_circuits, a, b, key, scratch)
+                }
            }
        }
    };
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/ciphertexts/fhe_uint_prepared.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/ciphertexts/fhe_uint_prepared.rs
@@ -33,7 +33,7 @@ pub struct FheUintPrepared<D: Data, T: UnsignedInteger, B: Backend> {

 impl<T: UnsignedInteger, BE: Backend> FheUintPreparedFactory<T, BE> for Module<BE> where Self: Sized + GGSWPreparedFactory<BE> {}

-pub trait GetGGSWBit<BE: Backend> {
+pub trait GetGGSWBit<BE: Backend>: Sync {
    fn get_bit(&self, bit: usize) -> GGSWPrepared<&[u8], BE>;
 }

@@ -222,7 +222,14 @@ impl<D: DataMut, BRA: BlindRotationAlgo, BE: Backend> BDDKeyPrepared<D, BRA, BE>
 }

 pub trait FheUintPrepare<BRA: BlindRotationAlgo, BE: Backend> {
-    fn fhe_uint_prepare_tmp_bytes<R, A, B>(&self, block_size: usize, extension_factor: usize, res_infos: &R, bits_infos: &A, bdd_infos: &B) -> usize
+    fn fhe_uint_prepare_tmp_bytes<R, A, B>(
+        &self,
+        block_size: usize,
+        extension_factor: usize,
+        res_infos: &R,
+        bits_infos: &A,
+        bdd_infos: &B,
+    ) -> usize
    where
        R: GGSWInfos,
        A: GLWEInfos,
@@ -258,6 +265,7 @@ pub trait FheUintPrepare<BRA: BlindRotationAlgo, BE: Backend> {
    {
        self.fhe_uint_prepare_custom_multi_thread(1, res, bits, bit_start, bit_count, key, scratch)
    }
+    #[allow(clippy::too_many_arguments)]
    fn fhe_uint_prepare_custom_multi_thread<DM, DB, DK, K, T: UnsignedInteger>(
        &self,
        threads: usize,
@@ -279,7 +287,14 @@ where
    Self: LWEFromGLWE<BE> + CirtuitBootstrappingExecute<BRA, BE> + GGSWPreparedFactory<BE>,
    Scratch<BE>: ScratchTakeCore<BE>,
 {
-    fn fhe_uint_prepare_tmp_bytes<R, A, B>(&self, block_size: usize, extension_factor: usize, res_infos: &R, bits_infos: &A, bdd_infos: &B) -> usize
+    fn fhe_uint_prepare_tmp_bytes<R, A, B>(
+        &self,
+        block_size: usize,
+        extension_factor: usize,
+        res_infos: &R,
+        bits_infos: &A,
+        bdd_infos: &B,
+    ) -> usize
    where
        R: GGSWInfos,
        A: GLWEInfos,
@@ -302,7 +317,7 @@ where
        bit_start: usize,
        bit_count: usize,
        key: &K,
-        mut scratch: &mut Scratch<BE>,
+        scratch: &mut Scratch<BE>,
    ) where
        DM: DataMut,
        DB: DataRef,
@@ -318,16 +333,9 @@ where

        assert!(scratch.available() >= threads * scratch_thread_size);

-        // How many bits we need to process
-        let chunk_size: usize = bit_count.div_ceil(threads); // ceil division
+        let chunk_size: usize = bit_count.div_ceil(threads);

-        let mut scratches = Vec::new();
-        for _ in 0..(threads - 1) {
-            let (tmp, scratch_new) = scratch.split_at_mut(scratch_thread_size);
-            scratch = scratch_new;
-            scratches.push(tmp);
-        }
-        scratches.push(scratch);
+        let (mut scratches, _) = scratch.split_mut(threads, scratch_thread_size);

        let ggsw_infos: &GGSWLayout = &res.ggsw_layout();

@@ -392,6 +400,7 @@ impl<D: DataMut, T: UnsignedInteger, BE: Backend> FheUintPrepared<D, T, BE> {
        module.fhe_uint_prepare_custom(self, other, bit_start, bit_end, key, scratch);
    }

+    #[allow(clippy::too_many_arguments)]
    pub fn prepare_custom_multi_thread<BRA, M, O, K, DK>(
        &mut self,
        threads: usize,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/ciphertexts/fhe_uint_prepared_debug.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/ciphertexts/fhe_uint_prepared_debug.rs
@@ -125,12 +125,12 @@ where
        DR0: DataRef,
        DR1: DataRef,
    {
-
        let (_, scratch_1) = scratch.take_ggsw(res);
        let (mut tmp_lwe, scratch_2) = scratch_1.take_lwe(bits);
        for (bit, dst) in res.bits.iter_mut().enumerate() {
            bits.get_bit_lwe(self, bit, &mut tmp_lwe, &key.ks, scratch_2);
-            key.cbt.execute_to_constant(self, dst, &tmp_lwe, 1, 1, scratch_2);
+            key.cbt
+                .execute_to_constant(self, dst, &tmp_lwe, 1, 1, scratch_2);
        }
    }
 }
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/eval.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/eval.rs
@@ -1,4 +1,5 @@
 use core::panic;
+use std::thread;

 use itertools::Itertools;
 use poulpy_core::{
@@ -6,17 +7,20 @@ use poulpy_core::{
    layouts::{GGSWInfos, GGSWPrepared, GLWE, GLWEInfos, GLWEToMut, GLWEToRef, LWEInfos, prepared::GGSWPreparedToRef},
 };
 use poulpy_hal::{
-    api::{ScratchTakeBasic, VecZnxBigAddSmallInplace, VecZnxBigNormalize, VecZnxBigNormalizeTmpBytes, VecZnxDftBytesOf},
+    api::{
+        ScratchAvailable, ScratchTakeBasic, VecZnxBigAddSmallInplace, VecZnxBigNormalize, VecZnxBigNormalizeTmpBytes,
+        VecZnxDftBytesOf,
+    },
    layouts::{Backend, DataMut, Module, Scratch, VecZnxBig, ZnxZero},
 };

 use crate::tfhe::bdd_arithmetic::{GetGGSWBit, UnsignedInteger};

-pub trait BitCircuitInfo {
+pub trait BitCircuitInfo: Sync {
    fn info(&self) -> (&[Node], usize);
 }

-pub trait GetBitCircuitInfo<T: UnsignedInteger> {
+pub trait GetBitCircuitInfo<T: UnsignedInteger>: Sync {
    fn input_size(&self) -> usize;
    fn output_size(&self) -> usize;
    fn get_circuit(&self, bit: usize) -> (&[Node], usize);
@@ -49,9 +53,34 @@ where
    }
 }

-pub trait ExecuteBDDCircuit<T: UnsignedInteger, BE: Backend> {
-    fn execute_bdd_circuit<C, G, O>(&self, out: &mut [GLWE<O>], inputs: &G, circuit: &C, scratch: &mut Scratch<BE>)
+pub trait ExecuteBDDCircuit<BE: Backend> {
+    fn execute_bdd_circuit_tmp_bytes<R, G>(&self, res_infos: &R, state_size: usize, ggsw_infos: &G) -> usize
    where
+        R: GLWEInfos,
+        G: GGSWInfos;
+
+    fn execute_bdd_circuit<C, G, O, T: UnsignedInteger>(
+        &self,
+        out: &mut [GLWE<O>],
+        inputs: &G,
+        circuit: &C,
+        scratch: &mut Scratch<BE>,
+    ) where
+        G: GetGGSWBit<BE> + BitSize,
+        C: GetBitCircuitInfo<T>,
+        O: DataMut,
+    {
+        self.execute_bdd_circuit_multi_thread(1, out, inputs, circuit, scratch);
+    }
+
+    fn execute_bdd_circuit_multi_thread<C, G, O, T: UnsignedInteger>(
+        &self,
+        threads: usize,
+        out: &mut [GLWE<O>],
+        inputs: &G,
+        circuit: &C,
+        scratch: &mut Scratch<BE>,
+    ) where
        G: GetGGSWBit<BE> + BitSize,
        C: GetBitCircuitInfo<T>,
        O: DataMut;
@@ -61,13 +90,27 @@ pub trait BitSize {
    fn bit_size(&self) -> usize;
 }

-impl<T: UnsignedInteger, BE: Backend> ExecuteBDDCircuit<T, BE> for Module<BE>
+impl<BE: Backend> ExecuteBDDCircuit<BE> for Module<BE>
 where
    Self: Cmux<BE> + GLWECopy,
    Scratch<BE>: ScratchTakeCore<BE>,
 {
-    fn execute_bdd_circuit<C, G, O>(&self, out: &mut [GLWE<O>], inputs: &G, circuit: &C, scratch: &mut Scratch<BE>)
+    fn execute_bdd_circuit_tmp_bytes<R, G>(&self, res_infos: &R, state_size: usize, ggsw_infos: &G) -> usize
    where
+        R: GLWEInfos,
+        G: GGSWInfos,
+    {
+        2 * state_size * GLWE::bytes_of_from_infos(res_infos) + self.cmux_tmp_bytes(res_infos, res_infos, ggsw_infos)
+    }
+
+    fn execute_bdd_circuit_multi_thread<C, G, O, T: UnsignedInteger>(
+        &self,
+        threads: usize,
+        out: &mut [GLWE<O>],
+        inputs: &G,
+        circuit: &C,
+        scratch: &mut Scratch<BE>,
+    ) where
        G: GetGGSWBit<BE> + BitSize,
        C: GetBitCircuitInfo<T>,
        O: DataMut,
@@ -88,66 +131,43 @@ where
            );
        }

-        for (i, out_i) in out.iter_mut().enumerate().take(circuit.output_size()) {
-            let (nodes, max_inter_state) = circuit.get_circuit(i);
+        let mut max_state_size = 0;
+        for i in 0..circuit.output_size() {
+            let (_, state_size) = circuit.get_circuit(i);
+            max_state_size = max_state_size.max(state_size)
+        }

-            if max_inter_state == 0 {
-                out_i.data_mut().zero();
-            } else {
-                assert!(nodes.len().is_multiple_of(max_inter_state));
+        let scratch_thread_size: usize = self.execute_bdd_circuit_tmp_bytes(&out[0], max_state_size, &inputs.get_bit(0));

-                let (mut level, scratch_1) = scratch.take_glwe_slice(max_inter_state * 2, out_i);
+        assert!(
+            scratch.available() >= threads * scratch_thread_size,
+            "scratch.available(): {} < threads:{threads} * scratch_thread_size: {scratch_thread_size}",
+            scratch.available()
+        );

-                level.iter_mut().for_each(|ct| ct.data_mut().zero());
+        let (mut scratches, _) = scratch.split_mut(threads, scratch_thread_size);

-                // TODO: implement API on GLWE
-                level[1]
-                    .data_mut()
-                    .encode_coeff_i64(out_i.base2k().into(), 0, 2, 0, 1);
+        let chunk_size: usize = circuit.output_size().div_ceil(threads).max(1); // chunks_mut(0) panics

-                let mut level_ref = level.iter_mut().collect_vec();
-                let (mut prev_level, mut next_level) = level_ref.split_at_mut(max_inter_state);
+        thread::scope(|scope| {
+            let chunks = out[..circuit.output_size()].chunks_mut(chunk_size);
+            for (chunk_idx, (scratch_thread, out_chunk)) in scratches.iter_mut().zip(chunks).enumerate() {
+                // `idx` below restarts at 0 in every chunk; `first_bit` maps it back to the global output bit.
+                let first_bit: usize = chunk_idx * chunk_size;
+                // Capture chunk + thread scratch by move
+                scope.spawn(move || {
+                    for (idx, out_i) in out_chunk.iter_mut().enumerate() {
+                        let (nodes, state_size) = circuit.get_circuit(first_bit + idx);

-                let (all_but_last, last) = nodes.split_at(nodes.len() - max_inter_state);
-
-                for nodes_lvl in all_but_last.chunks_exact(max_inter_state) {
-                    for (j, node) in nodes_lvl.iter().enumerate() {
-                        match node {
-                            Node::Cmux(in_idx, hi_idx, lo_idx) => {
-                                self.cmux(
-                                    next_level[j],
-                                    prev_level[*hi_idx],
-                                    prev_level[*lo_idx],
-                                    &inputs.get_bit(*in_idx),
-                                    scratch_1,
-                                );
-                            }
-                            Node::Copy => self.glwe_copy(next_level[j], prev_level[j]), /* Update BDD circuits to order Cmux -> Copy -> None so that mem swap can be used */
-                            Node::None => {}
+                        if state_size == 0 {
+                            out_i.data_mut().zero();
+                        } else {
+                            eval_level(self, out_i, inputs, nodes, state_size, *scratch_thread);
                        }
                    }
-
-                    (prev_level, next_level) = (next_level, prev_level);
-                }
-
-                // Last chunck of max_inter_state Nodes is always structured as
-                // [CMUX, NONE, NONE, ..., NONE]
-                match &last[0] {
-                    Node::Cmux(in_idx, hi_idx, lo_idx) => {
-                        self.cmux(
-                            out_i,
-                            prev_level[*hi_idx],
-                            prev_level[*lo_idx],
-                            &inputs.get_bit(*in_idx),
-                            scratch_1,
-                        );
-                    }
-                    _ => {
-                        panic!("invalid last node, should be CMUX")
-                    }
-                }
+                });
            }
-        }
+        });

        for out_i in out.iter_mut().skip(circuit.output_size()) {
            out_i.data_mut().zero();
@@ -155,6 +175,74 @@ where
    }
 }

+fn eval_level<M, R, G, BE: Backend>(
+    module: &M,
+    res: &mut R,
+    inputs: &G,
+    nodes: &[Node],
+    state_size: usize,
+    scratch: &mut Scratch<BE>,
+) where
+    M: Cmux<BE> + GLWECopy,
+    R: GLWEToMut,
+    G: GetGGSWBit<BE> + BitSize,
+    Scratch<BE>: ScratchTakeCore<BE>,
+{
+    assert!(nodes.len().is_multiple_of(state_size));
+    let res: &mut GLWE<&mut [u8]> = &mut res.to_mut();
+
+    let (mut level, scratch_1) = scratch.take_glwe_slice(state_size * 2, res);
+
+    level.iter_mut().for_each(|ct| ct.data_mut().zero());
+
+    // TODO: implement API on GLWE
+    level[1]
+        .data_mut()
+        .encode_coeff_i64(res.base2k().into(), 0, 2, 0, 1);
+
+    let mut level_ref: Vec<&mut GLWE<&mut [u8]>> = level.iter_mut().collect_vec();
+    let (mut prev_level, mut next_level) = level_ref.split_at_mut(state_size);
+
+    let (all_but_last, last) = nodes.split_at(nodes.len() - state_size);
+
+    for nodes_lvl in all_but_last.chunks_exact(state_size) {
+        for (j, node) in nodes_lvl.iter().enumerate() {
+            match node {
+                Node::Cmux(in_idx, hi_idx, lo_idx) => {
+                    module.cmux(
+                        next_level[j],
+                        prev_level[*hi_idx],
+                        prev_level[*lo_idx],
+                        &inputs.get_bit(*in_idx),
+                        scratch_1,
+                    );
+                }
+                Node::Copy => module.glwe_copy(next_level[j], prev_level[j]), /* Update BDD circuits to order Cmux -> Copy -> None so that mem swap can be used */
+                Node::None => {}
+            }
+        }
+
+        (prev_level, next_level) = (next_level, prev_level);
+    }
+
+    // Last chunk of `state_size` nodes is always structured as
+    // [CMUX, NONE, NONE, ..., NONE]
+    match &last[0] {
+        Node::Cmux(in_idx, hi_idx, lo_idx) => {
+            module.cmux(
+                res,
+                prev_level[*hi_idx],
+                prev_level[*lo_idx],
+                &inputs.get_bit(*in_idx),
+                scratch_1,
+            );
+        }
+        _ => {
+            panic!("invalid last node, should be CMUX")
+        }
+    }
+}
+
 impl<const N: usize> BitCircuit<N> {
    pub const fn new(nodes: [Node; N], max_inter_state: usize) -> Self {
        Self {
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/key.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/key.rs
@@ -136,17 +136,21 @@ where
    pub(crate) ks: GLWEToLWEKeyPrepared<D, BE>,
 }

-impl<D: DataRef, BRA: BlindRotationAlgo, BE: Backend> BDDKeyInfos for BDDKeyPrepared<D, BRA, BE>{
+impl<D: DataRef, BRA: BlindRotationAlgo, BE: Backend> BDDKeyInfos for BDDKeyPrepared<D, BRA, BE> {
    fn cbt_infos(&self) -> CircuitBootstrappingKeyLayout {
-        CircuitBootstrappingKeyLayout { layout_brk: self.cbt.brk_infos(), layout_atk: self.cbt.atk_infos(), layout_tsk: self.cbt.tsk_infos() }
+        CircuitBootstrappingKeyLayout {
+            layout_brk: self.cbt.brk_infos(),
+            layout_atk: self.cbt.atk_infos(),
+            layout_tsk: self.cbt.tsk_infos(),
+        }
    }
    fn ks_infos(&self) -> GLWEToLWEKeyLayout {
-        GLWEToLWEKeyLayout{
+        GLWEToLWEKeyLayout {
            n: self.ks.n(),
            base2k: self.ks.base2k(),
            k: self.ks.k(),
            rank_in: self.ks.rank_in(),
-            dnum: self.ks.dnum()
+            dnum: self.ks.dnum(),
        }
    }
 }
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/add.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/add.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/and.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/and.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/mod.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/mod.rs
@@ -133,8 +133,8 @@ impl<BRA: BlindRotationAlgo, BE: Backend> TestContext<BRA, BE> {
    }
 }

-pub(crate) const TEST_N_GLWE: u32 = 1024;
-pub(crate) const TEST_N_LWE: u32 = 574;
+pub(crate) const TEST_N_GLWE: u32 = 256;
+pub(crate) const TEST_N_LWE: u32 = 77;
 pub(crate) const TEST_BASE2K: u32 = 13;
 pub(crate) const TEST_K_GLWE: u32 = 26;
 pub(crate) const TEST_K_GGSW: u32 = 39;
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/or.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/or.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/prepare.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/prepare.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/sll.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/sll.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/slt.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/slt.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/sltu.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/sltu.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/sra.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/sra.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/srl.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/srl.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/sub.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/sub.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/xor.rs
+++ b/poulpy-schemes/src/tfhe/bdd_arithmetic/tests/test_suite/xor.rs
@@ -31,7 +31,7 @@ where
        + BDDKeyPreparedFactory<BRA, BE>
        + GGSWNoise<BE>
        + FheUintPrepare<BRA, BE>
-        + ExecuteBDDCircuit2WTo1W<u32, BE>
+        + ExecuteBDDCircuit2WTo1W<BE>
        + GLWEEncryptSk<BE>,
    BlindRotationKey<Vec<u8>, BRA>: BlindRotationKeyFactory<BRA>,
    ScratchOwned<BE>: ScratchOwnedAlloc<BE> + ScratchOwnedBorrow<BE>,
--- a/poulpy-schemes/src/tfhe/circuit_bootstrapping/circuit.rs
+++ b/poulpy-schemes/src/tfhe/circuit_bootstrapping/circuit.rs
@@ -8,7 +8,8 @@ use poulpy_hal::{
 use poulpy_core::{
    GGSWFromGGLWE, GLWEDecrypt, GLWEPacking, GLWERotate, GLWETrace, ScratchTakeCore,
    layouts::{
-        Dsize, GGLWE, GGLWEInfos, GGLWELayout, GGLWEPreparedToRef, GGSWInfos, GGSWToMut, GLWEAutomorphismKeyHelper, GLWEInfos, GLWESecretPreparedFactory, GLWEToMut, GLWEToRef, GetGaloisElement, LWEInfos, LWEToRef, Rank
+        Dsize, GGLWE, GGLWEInfos, GGLWELayout, GGLWEPreparedToRef, GGSWInfos, GGSWToMut, GLWEAutomorphismKeyHelper, GLWEInfos,
+        GLWESecretPreparedFactory, GLWEToMut, GLWEToRef, GetGaloisElement, LWEInfos, LWEToRef, Rank,
    },
 };

@@ -131,14 +132,13 @@ where
        R: GGSWInfos,
        A: CircuitBootstrappingKeyInfos,
    {
-
        let gglwe_infos: GGLWELayout = GGLWELayout {
            n: res_infos.n(),
            base2k: res_infos.base2k(),
            k: res_infos.k(),
            dnum: res_infos.dnum(),
            dsize: Dsize(1),
-            rank_in: res_infos.rank().max(Rank(1)).into(),
+            rank_in: res_infos.rank().max(Rank(1)),
            rank_out: res_infos.rank(),
        };

@@ -149,7 +149,9 @@ where
            &cbt_infos.brk_infos(),
        )
        .max(self.glwe_trace_tmp_bytes(res_infos, res_infos, &cbt_infos.atk_infos()))
-        .max(self.ggsw_from_gglwe_tmp_bytes(res_infos, &cbt_infos.tsk_infos())) + GLWE::bytes_of_from_infos(res_infos) + GGLWE::bytes_of_from_infos(&gglwe_infos)
+        .max(self.ggsw_from_gglwe_tmp_bytes(res_infos, &cbt_infos.tsk_infos()))
+            + GLWE::bytes_of_from_infos(res_infos)
+            + GGLWE::bytes_of_from_infos(&gglwe_infos)
    }

    fn circuit_bootstrapping_execute_to_constant<R, L, D>(
@@ -165,8 +167,9 @@ where
        L: LWEToRef + LWEInfos,
        D: DataRef,
    {
-
-        assert!(scratch.available() >= self.circuit_bootstrapping_execute_tmp_bytes(key.block_size(), extension_factor, res, key));
+        assert!(
+            scratch.available() >= self.circuit_bootstrapping_execute_tmp_bytes(key.block_size(), extension_factor, res, key)
+        );

        circuit_bootstrap_core(
            false,
@@ -195,8 +198,9 @@ where
        L: LWEToRef + LWEInfos,
        D: DataRef,
    {
-
-        assert!(scratch.available() >= self.circuit_bootstrapping_execute_tmp_bytes(key.block_size(), extension_factor, res, key));
+        assert!(
+            scratch.available() >= self.circuit_bootstrapping_execute_tmp_bytes(key.block_size(), extension_factor, res, key)
+        );

        circuit_bootstrap_core(
            true,