Merge pull request #123 from phantomzone-org/non-avx-build

Improvement to non-avx/x86 platform
2026-02-10 05:06:44 +01:00 · 2025-11-21 17:16:44 +01:00
parent 8a039e1c3a 3c818d292b
commit 0ce56938fc
25 changed files with 370 additions and 113 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -11,7 +11,7 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-    - name: Checkout code
+    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: recursive
@@ -21,7 +21,7 @@ jobs:
      with:
        components: clippy, rustfmt

-    - name: Cache cargo dependencies
+    - name: Cache cargo deps
      uses: actions/cache@v4
      with:
        path: |
@@ -32,14 +32,48 @@ jobs:
        restore-keys: |
          ${{ runner.os }}-cargo-

-    - name: Build
-      run: cargo build --all-targets
-
-    - name: Clippy (deny warnings)
-      run: cargo clippy --workspace --all-targets --all-features
+    # Detect whether runner supports AVX2 + FMA (result exposed as steps.avxcheck.outputs.supported)
+    - name: Detect AVX support
+      id: avxcheck
+      run: |
+        if lscpu | grep -qi avx2 && lscpu | grep -qi fma; then
+          echo "supported=true" >> "$GITHUB_OUTPUT"
+        else
+          echo "supported=false" >> "$GITHUB_OUTPUT"
+        fi

+    # rustfmt always runs — unrelated to AVX support
    - name: rustfmt (check only)
      run: cargo fmt --all --check

-    - name: Run tests
-      run: cargo test --all
+    # Build / lint / test WITH AVX
+    - name: Build (AVX enabled)
+      if: steps.avxcheck.outputs.supported == 'true'
+      run: |
+        RUSTFLAGS="-C target-feature=+avx2,+fma" \
+        cargo build --workspace --all-targets --features enable-avx
+
+    - name: Clippy (AVX enabled)
+      if: steps.avxcheck.outputs.supported == 'true'
+      run: |
+        RUSTFLAGS="-C target-feature=+avx2,+fma" \
+        cargo clippy --workspace --all-targets --features enable-avx -- -D warnings
+
+    - name: Tests (AVX enabled)
+      if: steps.avxcheck.outputs.supported == 'true'
+      run: |
+        RUSTFLAGS="-C target-feature=+avx2,+fma" \
+        cargo test --workspace --features enable-avx
+
+    # Build / lint / test WITHOUT AVX
+    - name: Build (portable mode)
+      if: steps.avxcheck.outputs.supported == 'false'
+      run: cargo build --workspace --all-targets
+
+    - name: Clippy (portable mode)
+      if: steps.avxcheck.outputs.supported == 'false'
+      run: cargo clippy --workspace --all-targets -- -D warnings
+
+    - name: Tests (portable mode)
+      if: steps.avxcheck.outputs.supported == 'false'
+      run: cargo test --workspace
--- a/poulpy-core/Cargo.toml
+++ b/poulpy-core/Cargo.toml
@@ -8,12 +8,16 @@ repository = "https://github.com/phantomzone-org/poulpy"
 homepage = "https://github.com/phantomzone-org/poulpy"
 documentation = "https://docs.rs/poulpy"

+[features]
+enable-avx = ["dep:poulpy-cpu-avx", "poulpy-cpu-avx/enable-avx"] # poulpy-cpu-avx compiles to an empty crate unless its own `enable-avx` is on
+default = ["dep:poulpy-cpu-ref"]
+
 [dependencies]
 rug = {workspace = true}
 criterion = {workspace = true}
 poulpy-hal = {workspace = true}
-poulpy-cpu-avx = {workspace = true}
-poulpy-cpu-ref = {workspace = true}
+poulpy-cpu-avx = {workspace = true, optional = true}
+poulpy-cpu-ref = {workspace = true, optional = true}
 itertools = {workspace = true}
 byteorder = {workspace = true}
 bytemuck = {workspace = true}
--- a/poulpy-core/benches/external_product_glwe_fft64.rs
+++ b/poulpy-core/benches/external_product_glwe_fft64.rs
@@ -6,7 +6,12 @@ use std::hint::black_box;

 use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};

-use poulpy_cpu_ref::FFT64Ref;
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64"))]
+pub use poulpy_cpu_avx::FFT64Avx as BackendImpl;
+
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64")))]
+pub use poulpy_cpu_ref::FFT64Ref as BackendImpl;
+
 use poulpy_hal::{
    api::{ModuleNew, ScratchOwnedAlloc, ScratchOwnedBorrow},
    layouts::{Module, ScalarZnx, ScratchOwned},
@@ -26,7 +31,7 @@ fn bench_external_product_glwe_fft64(c: &mut Criterion) {
    }

    fn runner(p: Params) -> impl FnMut() {
-        let module: Module<FFT64Ref> = Module::<FFT64Ref>::new(1 << p.log_n);
+        let module: Module<BackendImpl> = Module::<BackendImpl>::new(1 << p.log_n);

        let n: Degree = Degree(module.n() as u32);
        let base2k: Base2K = p.base2k;
@@ -42,8 +47,8 @@ fn bench_external_product_glwe_fft64(c: &mut Criterion) {
            n,
            base2k,
            k: k_ggsw,
-            dnum: dnum,
-            dsize: dsize,
+            dnum,
+            dsize,
            rank,
        };

@@ -66,7 +71,7 @@ fn bench_external_product_glwe_fft64(c: &mut Criterion) {
        let mut ct_glwe_out: GLWE<Vec<u8>> = GLWE::alloc_from_infos(&glwe_out_layout);
        let pt_rgsw: ScalarZnx<Vec<u8>> = ScalarZnx::alloc(n.into(), 1);

-        let mut scratch: ScratchOwned<FFT64Ref> = ScratchOwned::alloc(
+        let mut scratch: ScratchOwned<BackendImpl> = ScratchOwned::alloc(
            GGSW::encrypt_sk_tmp_bytes(&module, &ggsw_layout)
                | GLWE::encrypt_sk_tmp_bytes(&module, &glwe_in_layout)
                | GLWE::external_product_tmp_bytes(&module, &glwe_out_layout, &glwe_in_layout, &ggsw_layout),
@@ -79,7 +84,7 @@ fn bench_external_product_glwe_fft64(c: &mut Criterion) {
        let mut sk: GLWESecret<Vec<u8>> = GLWESecret::alloc_from_infos(&glwe_in_layout);
        sk.fill_ternary_prob(0.5, &mut source_xs);

-        let mut sk_dft: GLWESecretPrepared<Vec<u8>, FFT64Ref> = GLWESecretPrepared::alloc(&module, rank);
+        let mut sk_dft: GLWESecretPrepared<Vec<u8>, BackendImpl> = GLWESecretPrepared::alloc(&module, rank);
        sk_dft.prepare(&module, &sk);

        ct_ggsw.encrypt_sk(
@@ -99,7 +104,7 @@ fn bench_external_product_glwe_fft64(c: &mut Criterion) {
            scratch.borrow(),
        );

-        let mut ggsw_prepared: GGSWPrepared<Vec<u8>, FFT64Ref> = GGSWPrepared::alloc_from_infos(&module, &ct_ggsw);
+        let mut ggsw_prepared: GGSWPrepared<Vec<u8>, BackendImpl> = GGSWPrepared::alloc_from_infos(&module, &ct_ggsw);
        ggsw_prepared.prepare(&module, &ct_ggsw, scratch.borrow());

        move || {
@@ -138,7 +143,7 @@ fn bench_external_product_glwe_inplace_fft64(c: &mut Criterion) {
    }

    fn runner(p: Params) -> impl FnMut() {
-        let module: Module<FFT64Ref> = Module::<FFT64Ref>::new(1 << p.log_n);
+        let module: Module<BackendImpl> = Module::<BackendImpl>::new(1 << p.log_n);

        let n: Degree = Degree(module.n() as u32);
        let base2k: Base2K = p.base2k;
@@ -153,8 +158,8 @@ fn bench_external_product_glwe_inplace_fft64(c: &mut Criterion) {
            n,
            base2k,
            k: k_ggsw,
-            dnum: dnum,
-            dsize: dsize,
+            dnum,
+            dsize,
            rank,
        };

@@ -169,7 +174,7 @@ fn bench_external_product_glwe_inplace_fft64(c: &mut Criterion) {
        let mut ct_glwe: GLWE<Vec<u8>> = GLWE::alloc_from_infos(&glwe_layout);
        let pt_rgsw: ScalarZnx<Vec<u8>> = ScalarZnx::alloc(n.into(), 1);

-        let mut scratch: ScratchOwned<FFT64Ref> = ScratchOwned::alloc(
+        let mut scratch: ScratchOwned<BackendImpl> = ScratchOwned::alloc(
            GGSW::encrypt_sk_tmp_bytes(&module, &ggsw_layout)
                | GLWE::encrypt_sk_tmp_bytes(&module, &glwe_layout)
                | GLWE::external_product_tmp_bytes(&module, &glwe_layout, &glwe_layout, &ggsw_layout),
@@ -182,7 +187,7 @@ fn bench_external_product_glwe_inplace_fft64(c: &mut Criterion) {
        let mut sk: GLWESecret<Vec<u8>> = GLWESecret::alloc_from_infos(&glwe_layout);
        sk.fill_ternary_prob(0.5, &mut source_xs);

-        let mut sk_dft: GLWESecretPrepared<Vec<u8>, FFT64Ref> = GLWESecretPrepared::alloc(&module, rank);
+        let mut sk_dft: GLWESecretPrepared<Vec<u8>, BackendImpl> = GLWESecretPrepared::alloc(&module, rank);
        sk_dft.prepare(&module, &sk);

        ct_ggsw.encrypt_sk(
@@ -202,7 +207,7 @@ fn bench_external_product_glwe_inplace_fft64(c: &mut Criterion) {
            scratch.borrow(),
        );

-        let mut ggsw_prepared: GGSWPrepared<Vec<u8>, FFT64Ref> = GGSWPrepared::alloc_from_infos(&module, &ct_ggsw);
+        let mut ggsw_prepared: GGSWPrepared<Vec<u8>, BackendImpl> = GGSWPrepared::alloc_from_infos(&module, &ct_ggsw);
        ggsw_prepared.prepare(&module, &ct_ggsw, scratch.borrow());
        move || {
            let scratch_borrow = scratch.borrow();
--- a/poulpy-core/benches/keyswitch_glwe_fft64.rs
+++ b/poulpy-core/benches/keyswitch_glwe_fft64.rs
@@ -6,7 +6,13 @@ use poulpy_core::layouts::{
 use std::{hint::black_box, time::Duration};

 use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
-use poulpy_cpu_ref::FFT64Ref;
+
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64"))]
+pub use poulpy_cpu_avx::FFT64Avx as BackendImpl;
+
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64")))]
+pub use poulpy_cpu_ref::FFT64Ref as BackendImpl;
+
 use poulpy_hal::{
    api::{ModuleNew, ScratchOwnedAlloc, ScratchOwnedBorrow},
    layouts::{Module, ScratchOwned},
@@ -27,7 +33,7 @@ fn bench_keyswitch_glwe_fft64(c: &mut Criterion) {
    }

    fn runner(p: Params) -> impl FnMut() {
-        let module: Module<FFT64Ref> = Module::<FFT64Ref>::new(1 << p.log_n);
+        let module: Module<BackendImpl> = Module::<BackendImpl>::new(1 << p.log_n);

        let n: Degree = Degree(module.n() as u32);
        let base2k: Base2K = p.base2k;
@@ -66,7 +72,7 @@ fn bench_keyswitch_glwe_fft64(c: &mut Criterion) {
        let mut ct_in: GLWE<Vec<u8>> = GLWE::alloc_from_infos(&glwe_in_layout);
        let mut ct_out: GLWE<Vec<u8>> = GLWE::alloc_from_infos(&glwe_out_layout);

-        let mut scratch: ScratchOwned<FFT64Ref> = ScratchOwned::alloc(
+        let mut scratch: ScratchOwned<BackendImpl> = ScratchOwned::alloc(
            GLWESwitchingKey::encrypt_sk_tmp_bytes(&module, &gglwe_atk_layout)
                | GLWE::encrypt_sk_tmp_bytes(&module, &glwe_in_layout)
                | GLWE::keyswitch_tmp_bytes(
@@ -84,7 +90,7 @@ fn bench_keyswitch_glwe_fft64(c: &mut Criterion) {
        let mut sk_in: GLWESecret<Vec<u8>> = GLWESecret::alloc_from_infos(&glwe_in_layout);
        sk_in.fill_ternary_prob(0.5, &mut source_xs);

-        let mut sk_in_dft: GLWESecretPrepared<Vec<u8>, FFT64Ref> = GLWESecretPrepared::alloc(&module, rank);
+        let mut sk_in_dft: GLWESecretPrepared<Vec<u8>, BackendImpl> = GLWESecretPrepared::alloc(&module, rank);
        sk_in_dft.prepare(&module, &sk_in);

        ksk.encrypt_sk(
@@ -150,7 +156,7 @@ fn bench_keyswitch_glwe_inplace_fft64(c: &mut Criterion) {
    }

    fn runner(p: Params) -> impl FnMut() {
-        let module: Module<FFT64Ref> = Module::<FFT64Ref>::new(1 << p.log_n);
+        let module: Module<BackendImpl> = Module::<BackendImpl>::new(1 << p.log_n);

        let n: Degree = Degree(module.n() as u32);
        let base2k: Base2K = p.base2k;
@@ -181,7 +187,7 @@ fn bench_keyswitch_glwe_inplace_fft64(c: &mut Criterion) {
        let mut ksk: GLWESwitchingKey<Vec<u8>> = GLWESwitchingKey::alloc_from_infos(&gglwe_layout);
        let mut ct: GLWE<Vec<u8>> = GLWE::alloc_from_infos(&glwe_layout);

-        let mut scratch: ScratchOwned<FFT64Ref> = ScratchOwned::alloc(
+        let mut scratch: ScratchOwned<BackendImpl> = ScratchOwned::alloc(
            GLWESwitchingKey::encrypt_sk_tmp_bytes(&module, &gglwe_layout)
                | GLWE::encrypt_sk_tmp_bytes(&module, &glwe_layout)
                | GLWE::keyswitch_tmp_bytes(&module, &glwe_layout, &glwe_layout, &gglwe_layout),
@@ -194,7 +200,7 @@ fn bench_keyswitch_glwe_inplace_fft64(c: &mut Criterion) {
        let mut sk_in: GLWESecret<Vec<u8>> = GLWESecret::alloc_from_infos(&glwe_layout);
        sk_in.fill_ternary_prob(0.5, &mut source_xs);

-        let mut sk_in_dft: GLWESecretPrepared<Vec<u8>, FFT64Ref> = GLWESecretPrepared::alloc(&module, rank);
+        let mut sk_in_dft: GLWESecretPrepared<Vec<u8>, BackendImpl> = GLWESecretPrepared::alloc(&module, rank);
        sk_in_dft.prepare(&module, &sk_in);

        let mut sk_out: GLWESecret<Vec<u8>> = GLWESecret::alloc_from_infos(&glwe_layout);
--- a/poulpy-core/examples/encryption.rs
+++ b/poulpy-core/examples/encryption.rs
@@ -5,7 +5,13 @@ use poulpy_core::{
        prepared::GLWESecretPrepared,
    },
 };
-use poulpy_cpu_ref::FFT64Ref;
+
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64"))]
+pub use poulpy_cpu_avx::FFT64Avx as BackendImpl;
+
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64")))]
+pub use poulpy_cpu_ref::FFT64Ref as BackendImpl;
+
 use poulpy_hal::{
    api::{ModuleNew, ScratchOwnedAlloc, ScratchOwnedBorrow, VecZnxFillUniform},
    layouts::{Module, ScratchOwned},
@@ -31,7 +37,7 @@ fn main() {
    let rank: Rank = Rank(1);

    // Instantiate Module (DFT Tables)
-    let module: Module<FFT64Ref> = Module::<FFT64Ref>::new(n.0 as u64);
+    let module: Module<BackendImpl> = Module::<BackendImpl>::new(n.0 as u64);

    let glwe_ct_infos: GLWELayout = GLWELayout {
        n,
@@ -53,7 +59,7 @@ fn main() {
    let mut source_xa: Source = Source::new([2u8; 32]);

    // Scratch space
-    let mut scratch: ScratchOwned<FFT64Ref> = ScratchOwned::alloc(
+    let mut scratch: ScratchOwned<BackendImpl> = ScratchOwned::alloc(
        GLWE::encrypt_sk_tmp_bytes(&module, &glwe_ct_infos) | GLWE::decrypt_tmp_bytes(&module, &glwe_ct_infos),
    );

@@ -62,7 +68,7 @@ fn main() {
    sk.fill_ternary_prob(0.5, &mut source_xs);

    // Backend-prepared secret
-    let mut sk_prepared: GLWESecretPrepared<Vec<u8>, FFT64Ref> = GLWESecretPrepared::alloc(&module, rank);
+    let mut sk_prepared: GLWESecretPrepared<Vec<u8>, BackendImpl> = GLWESecretPrepared::alloc(&module, rank);
    sk_prepared.prepare(&module, &sk);

    // Uniform plaintext
--- a/poulpy-core/src/scratch.rs
+++ b/poulpy-core/src/scratch.rs
@@ -358,7 +358,7 @@ where
        let pairs: u32 = (((infos.rank_out().0 + 1) * infos.rank_out().0) >> 1).max(1);
        let mut ksk_infos: GGLWELayout = infos.gglwe_layout();
        ksk_infos.rank_in = Rank(pairs);
-        let (data, scratch) = self.take_gglwe(infos);
+        let (data, scratch) = self.take_gglwe(&ksk_infos);
        (GLWETensorKey(data), scratch)
    }

@@ -377,7 +377,7 @@ where
        let pairs: u32 = (((infos.rank_out().0 + 1) * infos.rank_out().0) >> 1).max(1);
        let mut ksk_infos: GGLWELayout = infos.gglwe_layout();
        ksk_infos.rank_in = Rank(pairs);
-        let (data, scratch) = self.take_gglwe_prepared(module, infos);
+        let (data, scratch) = self.take_gglwe_prepared(module, &ksk_infos);
        (GLWETensorKeyPrepared(data), scratch)
    }
 }
--- a/poulpy-core/src/tests/mod.rs
+++ b/poulpy-core/src/tests/mod.rs
@@ -4,10 +4,10 @@ pub mod test_suite;
 mod serialization;

 #[cfg(test)]
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64"))]
 mod poulpy_core {
    use poulpy_hal::backend_test_suite;

-    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    backend_test_suite!(
    mod cpu_avx,
    backend = poulpy_cpu_avx::FFT64Avx,
@@ -69,8 +69,13 @@ mod poulpy_core {
        lwe_to_glwe => crate::tests::test_suite::test_lwe_to_glwe,
    }
    );
+}
+
+#[cfg(test)]
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64")))]
+mod poulpy_core {
+    use poulpy_hal::backend_test_suite;

-    #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
    backend_test_suite!(
    mod cpu_ref,
    backend = poulpy_cpu_ref::FFT64Ref,
--- a/poulpy-cpu-avx/Cargo.toml
+++ b/poulpy-cpu-avx/Cargo.toml
@@ -9,6 +9,9 @@ repository = "https://github.com/phantomzone-org/poulpy"
 homepage = "https://github.com/phantomzone-org/poulpy"
 documentation = "https://docs.rs/poulpy"

+[features]
+enable-avx = []
+
 [dependencies]
 poulpy-hal = {workspace = true}
 poulpy-cpu-ref = {workspace = true}
--- a/poulpy-cpu-avx/README.md
+++ b/poulpy-cpu-avx/README.md
@@ -1,8 +1,51 @@
-# 🐙 Poulpy-CPU-REF
+# 🐙 Poulpy-CPU-AVX

-**Poulpy-Backend-CPU-AVX** is a Rust crate that provides an AVX accelerated CPU implementation of **`poulpy-hal`**. This crate is used to instantiate projects implemented with **`poulpy-hal`**, **`poulpy-core`** and/or **`poulpy-schemes`**.
+**Poulpy-CPU-AVX** is a Rust crate that provides an **AVX2 + FMA accelerated CPU backend for Poulpy**.

-## Example
+This backend implements the Poulpy HAL extension traits and can be used by:
+
+- [`poulpy-hal`](https://github.com/phantomzone-org/poulpy/tree/main/poulpy-hal)
+- [`poulpy-core`](https://github.com/phantomzone-org/poulpy/tree/main/poulpy-core)
+- [`poulpy-schemes`](https://github.com/phantomzone-org/poulpy/tree/main/poulpy-schemes)
+
+## 🚩 Safety and Requirements
+
+To avoid illegal hardware instructions (SIGILL) on unsupported CPUs, this backend is **opt-in** and **only builds when explicitly requested**.
+
+| Requirement | Status |
+|------------|--------|
+| Cargo feature flag | `--features enable-avx` **must be enabled** |
+| CPU architecture | `x86_64` |
+| CPU target features | `AVX2` + `FMA` |
+
+If `enable-avx` is enabled but the target does not provide these capabilities, the build **fails immediately with a clear error message**, rather than generating invalid binaries.
+
+When `enable-avx` is **not** enabled, this crate is simply skipped and Poulpy automatically falls back to the portable `poulpy-cpu-ref` backend. This ensures that Poulpy's workspace remains portable (e.g. for macOS ARM).
+
+## ⚙️ Building with the AVX backend enabled
+
+Because the compiler must generate AVX2 + FMA instructions, both the Cargo feature and CPU target flags must be specified:
+
+```bash
+RUSTFLAGS="-C target-feature=+avx2,+fma" \
+cargo build --features enable-avx
+```
+
+### Running an example
+
+```bash
+RUSTFLAGS="-C target-feature=+avx2,+fma" \
+cargo run --example <name> --features enable-avx
+```
+
+### Running benchmarks
+
+```bash
+RUSTFLAGS="-C target-feature=+avx2,+fma" \
+cargo bench --features enable-avx
+```
+
+## Basic Usage

 ```rust
-use poulpy_backend_cpu_avx::FFT64Avx;
+use poulpy_cpu_avx::FFT64Avx;
@@ -12,7 +55,24 @@ let log_n: usize = 10;
-let module = Module<FFT64Avx> = Module<FFT64Avx>::new(1<<log_n);
+let module: Module<FFT64Avx> = Module::<FFT64Avx>::new(1 << log_n);
 ```

-## Contributors
+Once compiled with `enable-avx`, the backend is usable transparently anywhere Poulpy expects a backend type (`poulpy-hal`, `poulpy-core`, `poulpy-schemes`).

-To add your own backend, implement the open extension traits from **`poulpy-hal/oep`** for a struct that implements the `Backend` trait.  
-This will automatically make your backend compatible with the API of **`poulpy-hal`**, **`poulpy-core`** and **`poulpy-schemes`**.
+## 🤝 Contributors
+
+To implement your own Poulpy backend (SIMD or accelerator):
+
+1. Define a backend struct
+2. Implement the open extension traits from `poulpy-hal/oep`
+3. Implement the `Backend` trait
+
+Your backend will automatically integrate with:
+
+* `poulpy-hal`
+* `poulpy-core`
+* `poulpy-schemes`
+
+No modifications to those crates are required — the HAL provides the extension points.
+
+---
+
+For questions or guidance, feel free to open an issue or discussion in the repository.
--- a/poulpy-cpu-avx/benches/fft.rs
+++ b/poulpy-cpu-avx/benches/fft.rs
@@ -1,10 +1,17 @@
-use std::hint::black_box;
+use criterion::{Criterion, criterion_group, criterion_main};

-use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
-use poulpy_cpu_avx::{ReimFFTAvx, ReimIFFTAvx};
-use poulpy_hal::reference::fft64::reim::{ReimDFTExecute, ReimFFTTable, ReimIFFTTable};
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
+fn bench_ifft_avx2_fma(_c: &mut Criterion) {
+    eprintln!("Skipping: AVX IFft benchmark requires x86_64 + AVX2 + FMA");
+}

+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
 pub fn bench_ifft_avx2_fma(c: &mut Criterion) {
+    use criterion::BenchmarkId;
+    use poulpy_cpu_avx::ReimIFFTAvx;
+    use poulpy_hal::reference::fft64::reim::{ReimDFTExecute, ReimIFFTTable};
+    use std::hint::black_box;
+
    let group_name: String = "ifft_avx2_fma".to_string();

    let mut group = c.benchmark_group(group_name);
@@ -40,7 +47,18 @@ pub fn bench_ifft_avx2_fma(c: &mut Criterion) {
    group.finish();
 }

+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
+fn bench_fft_avx2_fma(_c: &mut Criterion) {
+    eprintln!("Skipping: AVX FFT benchmark requires x86_64 + AVX2 + FMA");
+}
+
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
 pub fn bench_fft_avx2_fma(c: &mut Criterion) {
+    use criterion::BenchmarkId;
+    use poulpy_cpu_avx::ReimFFTAvx;
+    use poulpy_hal::reference::fft64::reim::{ReimDFTExecute, ReimFFTTable};
+    use std::hint::black_box;
+
    let group_name: String = "fft_avx2_fma".to_string();

    let mut group = c.benchmark_group(group_name);
--- a/poulpy-cpu-avx/benches/vec_znx.rs
+++ b/poulpy-cpu-avx/benches/vec_znx.rs
@@ -1,20 +1,36 @@
-// poulpy-backend/benches/vec_znx_add.rs
 use criterion::{Criterion, criterion_group, criterion_main};
-use poulpy_cpu_avx::FFT64Avx;
-use poulpy_hal::reference::vec_znx::{bench_vec_znx_add, bench_vec_znx_automorphism, bench_vec_znx_normalize_inplace};

-#[allow(dead_code)]
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
+fn bench_vec_znx_add_cpu_avx_fft64(_c: &mut Criterion) {
+    eprintln!("Skipping: AVX vec_znx_add benchmark requires x86_64 + AVX2 + FMA");
+}
+
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
 fn bench_vec_znx_add_cpu_avx_fft64(c: &mut Criterion) {
-    bench_vec_znx_add::<FFT64Avx>(c, "FFT64Avx");
+    use poulpy_cpu_avx::FFT64Avx;
+    poulpy_hal::reference::vec_znx::bench_vec_znx_add::<FFT64Avx>(c, "FFT64Avx");
 }

-#[allow(dead_code)]
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
+fn bench_vec_znx_normalize_inplace_cpu_avx_fft64(_c: &mut Criterion) {
+    eprintln!("Skipping: AVX vec_znx_normalize_inplace benchmark requires x86_64 + AVX2 + FMA");
+}
+
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
 fn bench_vec_znx_normalize_inplace_cpu_avx_fft64(c: &mut Criterion) {
-    bench_vec_znx_normalize_inplace::<FFT64Avx>(c, "FFT64Avx");
+    use poulpy_cpu_avx::FFT64Avx;
+    poulpy_hal::reference::vec_znx::bench_vec_znx_normalize_inplace::<FFT64Avx>(c, "FFT64Avx");
 }

+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
+fn bench_vec_znx_automorphism_cpu_avx_fft64(_c: &mut Criterion) {
+    eprintln!("Skipping: AVX vec_znx_automorphism benchmark requires x86_64 + AVX2 + FMA");
+}
+
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
 fn bench_vec_znx_automorphism_cpu_avx_fft64(c: &mut Criterion) {
-    bench_vec_znx_automorphism::<FFT64Avx>(c, "FFT64Avx");
+    use poulpy_cpu_avx::FFT64Avx;
+    poulpy_hal::reference::vec_znx::bench_vec_znx_automorphism::<FFT64Avx>(c, "FFT64Avx");
 }

 criterion_group!(
--- a/poulpy-cpu-avx/benches/vmp.rs
+++ b/poulpy-cpu-avx/benches/vmp.rs
@@ -1,10 +1,14 @@
-// poulpy-backend/benches/vec_znx_add.rs
 use criterion::{Criterion, criterion_group, criterion_main};
-use poulpy_cpu_avx::FFT64Avx;
-use poulpy_hal::bench_suite::vmp::bench_vmp_apply_dft_to_dft;

+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
+fn bench_vmp_apply_dft_to_dft_cpu_avx_fft64(_c: &mut Criterion) {
+    eprintln!("Skipping: AVX vmp_apply_dft_to_dft benchmark requires x86_64 + AVX2 + FMA");
+}
+
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
 fn bench_vmp_apply_dft_to_dft_cpu_avx_fft64(c: &mut Criterion) {
-    bench_vmp_apply_dft_to_dft::<FFT64Avx>(c, "FFT64Avx");
+    use poulpy_cpu_avx::FFT64Avx;
+    poulpy_hal::bench_suite::vmp::bench_vmp_apply_dft_to_dft::<FFT64Avx>(c, "FFT64Avx");
 }

 criterion_group!(benches_x86, bench_vmp_apply_dft_to_dft_cpu_avx_fft64,);
--- a/poulpy-cpu-avx/examples/rlwe_encrypt.rs
+++ b/poulpy-cpu-avx/examples/rlwe_encrypt.rs
@@ -1,5 +1,10 @@
 use itertools::izip;
-use poulpy_cpu_avx::FFT64Avx;
+
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
+use poulpy_cpu_avx::FFT64Avx as BackendImpl;
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
+use poulpy_cpu_ref::FFT64Ref as BackendImpl;
+
 use poulpy_hal::{
    api::{
        ModuleNew, ScratchOwnedAlloc, ScratchOwnedBorrow, SvpApplyDftToDftInplace, SvpPPolAlloc, SvpPrepare, VecZnxAddNormal,
@@ -16,9 +21,9 @@ fn main() {
    let ct_size: usize = 3;
    let msg_size: usize = 2;
    let log_scale: usize = msg_size * base2k - 5;
-    let module: Module<FFT64Avx> = Module::<FFT64Avx>::new(n as u64);
+    let module: Module<BackendImpl> = Module::<BackendImpl>::new(n as u64);

-    let mut scratch: ScratchOwned<FFT64Avx> = ScratchOwned::<FFT64Avx>::alloc(module.vec_znx_big_normalize_tmp_bytes());
+    let mut scratch: ScratchOwned<BackendImpl> = ScratchOwned::<BackendImpl>::alloc(module.vec_znx_big_normalize_tmp_bytes());

    let seed: [u8; 32] = [0; 32];
    let mut source: Source = Source::new(seed);
@@ -28,7 +33,7 @@ fn main() {
    s.fill_ternary_prob(0, 0.5, &mut source);

    // Buffer to store s in the DFT domain
-    let mut s_dft: SvpPPol<Vec<u8>, FFT64Avx> = module.svp_ppol_alloc(s.cols());
+    let mut s_dft: SvpPPol<Vec<u8>, BackendImpl> = module.svp_ppol_alloc(s.cols());

    // s_dft <- DFT(s)
    module.svp_prepare(&mut s_dft, 0, &s, 0);
@@ -43,7 +48,7 @@ fn main() {
    // Fill the second column with random values: ct = (0, a)
    module.vec_znx_fill_uniform(base2k, &mut ct, 1, &mut source);

-    let mut buf_dft: VecZnxDft<Vec<u8>, FFT64Avx> = module.vec_znx_dft_alloc(1, ct_size);
+    let mut buf_dft: VecZnxDft<Vec<u8>, BackendImpl> = module.vec_znx_dft_alloc(1, ct_size);

    module.vec_znx_dft_apply(1, 0, &mut buf_dft, 0, &ct, 1);

@@ -58,7 +63,7 @@ fn main() {
    // Alias scratch space (VecZnxDft<B> is always at least as big as VecZnxBig<B>)

    // BIG(ct[1] * s) <- IDFT(DFT(ct[1] * s)) (not normalized)
-    let mut buf_big: VecZnxBig<Vec<u8>, FFT64Avx> = module.vec_znx_big_alloc(1, ct_size);
+    let mut buf_big: VecZnxBig<Vec<u8>, BackendImpl> = module.vec_znx_big_alloc(1, ct_size);
    module.vec_znx_idft_apply_tmpa(&mut buf_big, 0, &mut buf_dft, 0);

    // Creates a plaintext: VecZnx with 1 column
--- a/poulpy-cpu-avx/src/lib.rs
+++ b/poulpy-cpu-avx/src/lib.rs
@@ -1,3 +1,20 @@
+// ─────────────────────────────────────────────────────────────
+// Opt-in gate: without `enable-avx` this crate compiles to nothing.
+// ─────────────────────────────────────────────────────────────
+#![cfg(feature = "enable-avx")]
+
+// Arch/CPU checks stay out of the crate-level cfg above: a false crate-level cfg strips these guards too.
+#[cfg(all(feature = "enable-avx", not(target_arch = "x86_64")))]
+compile_error!("feature `enable-avx` requires target_arch = \"x86_64\".");
+
+// If the user enables this backend but AVX2 isn't enabled in the target → abort
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", not(target_feature = "avx2")))]
+compile_error!("feature `enable-avx` requires AVX2. Build with RUSTFLAGS=\"-C target-feature=+avx2\".");
+
+// If the user enables this backend but FMA isn't enabled in the target → abort
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", not(target_feature = "fma")))]
+compile_error!("feature `enable-avx` requires FMA. Build with RUSTFLAGS=\"-C target-feature=+fma\".");
+
 mod module;
 mod reim;
 mod reim4;
--- a/poulpy-cpu-ref/README.md
+++ b/poulpy-cpu-ref/README.md
@@ -1,18 +1,93 @@
-# 🐙 Poulpy-CPU-AVX
+# 🐙 Poulpy-CPU-REF

-**Poulpy-Backend-CPU-AVX** is a Rust crate that provides the reference CPU implementation of **`poulpy-hal`**. This crate is used to instantiate projects implemented with **`poulpy-hal`**, **`poulpy-core`** and/or **`poulpy-schemes`**.
+**Poulpy-CPU-REF** is the **reference (portable) CPU backend for Poulpy**.

-## Example
+It implements the Poulpy HAL extension traits without requiring SIMD or specialized CPU instructions, making it suitable for:
+
+- all CPU architectures (`x86_64`, `aarch64`, `arm`, `riscv64`, …)
+- development machines and CI runners
+- environments without AVX or other advanced SIMD support
+
+This backend integrates transparently with:
+
+- `poulpy-hal`
+- `poulpy-core`
+- `poulpy-schemes`
+
+---
+
+## When is this backend used?
+
+`poulpy-cpu-ref` is always available and requires **no compilation flags and no CPU features**.
+
+It is automatically selected when:
+
+- the project does not request an optimized backend, or
+- the target CPU does not support the requested SIMD backend (e.g., AVX), or
+- portability and reproducibility are more important than raw performance.
+
+No additional configuration is required to use it.
+
+---
+
+## 🧪 Basic Usage

 ```rust
-use poulpy_backend_cpu_ref::FFT64Ref;
+use poulpy_cpu_ref::FFT64Ref;
 use poulpy_hal::{api::ModuleNew, layouts::Module};

 let log_n: usize = 10;
-let module = Module<FFT64Ref> = Module<FFT64Ref>::new(1<<log_n);
+let module: Module<FFT64Ref> = Module::<FFT64Ref>::new(1 << log_n);
 ```

-## Contributors
+This works on **all supported platforms and architectures**.

-To add your own backend, implement the open extension traits from **`poulpy-hal/oep`** for a struct that implements the `Backend` trait.  
-This will automatically make your backend compatible with the API of **`poulpy-hal`**, **`poulpy-core`** and **`poulpy-schemes`**.
+---
+
+## Performance Notes
+
+`poulpy-cpu-ref` prioritizes:
+
+* portability
+* correctness
+* ease of debugging
+
+For maximum performance on x86_64 CPUs with AVX2 + FMA support, consider enabling the optional optimized backend:
+
+```
+poulpy-cpu-avx (feature: enable-avx)
+```
+
+Benchmarks and applications can freely switch between backends without changing source code — backend selection can be handled with feature flags, for example
+
+```rust
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
+use poulpy_cpu_avx::FFT64Avx as BackendImpl;
+
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
+use poulpy_cpu_ref::FFT64Ref as BackendImpl;
+```
+
+---
+
+## 🤝 Contributors
+
+To implement your own backend (SIMD or accelerator):
+
+1. Define a backend struct
+2. Implement the open extension traits from `poulpy-hal/oep`
+3. Implement the `Backend` trait
+
+Your backend will automatically integrate with:
+
+* `poulpy-hal`
+* `poulpy-core`
+* `poulpy-schemes`
+
+No modifications to those crates are necessary — the HAL provides the extension points.
+
+---
+
+For questions or guidance, feel free to open an issue or discussion in the repository.
+
+
--- a/poulpy-hal/src/reference/vec_znx/normalize.rs
+++ b/poulpy-hal/src/reference/vec_znx/normalize.rs
@@ -367,12 +367,8 @@ fn test_vec_znx_normalize_conv() {

            let out_prec: u32 = (end_size * end_base2k) as u32;

-            let mut data_want: Vec<Float> = (0..n)
-                .map(|_| Float::with_val(out_prec as u32, 0))
-                .collect();
-            let mut data_res: Vec<Float> = (0..n)
-                .map(|_| Float::with_val(out_prec as u32, 0))
-                .collect();
+            let mut data_want: Vec<Float> = (0..n).map(|_| Float::with_val(out_prec, 0)).collect();
+            let mut data_res: Vec<Float> = (0..n).map(|_| Float::with_val(out_prec, 0)).collect();

            have.decode_vec_float(end_base2k, 0, &mut data_want);
            want.decode_vec_float(end_base2k, 0, &mut data_res);
--- a/poulpy-schemes/Cargo.toml
+++ b/poulpy-schemes/Cargo.toml
@@ -9,6 +9,9 @@ repository = "https://github.com/phantomzone-org/poulpy"
 homepage = "https://github.com/phantomzone-org/poulpy"
 documentation = "https://docs.rs/poulpy"

+[features]
+enable-avx = []
+
 [dependencies]
 poulpy-cpu-avx = {workspace = true}
 poulpy-cpu-ref = {workspace = true}
--- a/poulpy-schemes/benches/circuit_bootstrapping.rs
+++ b/poulpy-schemes/benches/circuit_bootstrapping.rs
@@ -8,8 +8,13 @@ use poulpy_core::{
        GLWESecretPreparedFactory, LWE, LWELayout, LWESecret,
    },
 };
-use poulpy_cpu_avx::FFT64Avx;
-use poulpy_cpu_ref::FFT64Ref;
+
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64"))]
+pub use poulpy_cpu_avx::FFT64Avx as BackendImpl;
+
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64")))]
+pub use poulpy_cpu_ref::FFT64Ref as BackendImpl;
+
 use poulpy_hal::{
    api::{ModuleN, ModuleNew, ScratchOwnedAlloc, ScratchOwnedBorrow, VecZnxRotateInplace},
    layouts::{Backend, Module, Scratch, ScratchOwned},
@@ -127,7 +132,7 @@ where
        }
    }

-    for params in [Params {
+    let params: Params = Params {
        name: String::from("1-bit"),
        extension_factor: 1,
        k_pt: 1,
@@ -171,27 +176,22 @@ where
                rank: 2_u32.into(),
            },
        },
-    }] {
+    };
+
    let id: BenchmarkId = BenchmarkId::from_parameter(params.name.clone());
    let mut runner = runner::<BE, BRA>(&params);
    group.bench_with_input(id, &(), |b, _| b.iter(&mut runner));
-    }

    group.finish();
 }

-fn bench_circuit_bootstrapping_cpu_ref_fft64(c: &mut Criterion) {
-    benc_circuit_bootstrapping::<FFT64Ref, CGGI>(c, "fft64_ref");
+fn bench_circuit_bootstrapping_fft64(c: &mut Criterion) {
+    #[cfg(all(feature = "enable-avx", target_arch = "x86_64"))]
+    let label = "fft64_avx";
+    #[cfg(not(all(feature = "enable-avx", target_arch = "x86_64")))]
+    let label = "fft64_ref";
+    benc_circuit_bootstrapping::<BackendImpl, CGGI>(c, label);
 }

-fn bench_circuit_bootstrapping_cpu_avx_fft64(c: &mut Criterion) {
-    benc_circuit_bootstrapping::<FFT64Avx, CGGI>(c, "fft64_avx");
-}
-
-criterion_group!(
-    benches,
-    bench_circuit_bootstrapping_cpu_ref_fft64,
-    bench_circuit_bootstrapping_cpu_avx_fft64,
-);
-
+criterion_group!(benches, bench_circuit_bootstrapping_fft64);
 criterion_main!(benches);
--- a/poulpy-schemes/examples/circuit_bootstrapping.rs
+++ b/poulpy-schemes/examples/circuit_bootstrapping.rs
@@ -8,10 +8,10 @@ use poulpy_core::{
 };
 use std::time::Instant;

-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
 use poulpy_cpu_avx::FFT64Avx as BackendImpl;

-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
 use poulpy_cpu_ref::FFT64Ref as BackendImpl;

 use poulpy_hal::{
--- a/poulpy-schemes/src/bin_fhe/bdd_arithmetic/tests/fft64_avx.rs
+++ b/poulpy-schemes/src/bin_fhe/bdd_arithmetic/tests/fft64_avx.rs
@@ -5,7 +5,7 @@ use poulpy_cpu_avx::FFT64Avx;
 use crate::bin_fhe::{bdd_arithmetic::tests::test_suite, blind_rotation::CGGI};

 static TEST_CONTEXT_CGGI_FFT64_REF: LazyLock<test_suite::TestContext<CGGI, FFT64Avx>> =
-    LazyLock::new(|| test_suite::TestContext::<CGGI, FFT64Avx>::new());
+    LazyLock::new(test_suite::TestContext::<CGGI, FFT64Avx>::new);

 #[test]
 fn glwe_blind_retriever() {
--- a/poulpy-schemes/src/bin_fhe/bdd_arithmetic/tests/fft64_ref.rs
+++ b/poulpy-schemes/src/bin_fhe/bdd_arithmetic/tests/fft64_ref.rs
@@ -2,10 +2,10 @@ use std::sync::LazyLock;

 use poulpy_cpu_ref::FFT64Ref;

-use crate::tfhe::{bdd_arithmetic::tests::test_suite, blind_rotation::CGGI};
+use crate::bin_fhe::{bdd_arithmetic::tests::test_suite, blind_rotation::CGGI};

 static TEST_CONTEXT_CGGI_FFT64_REF: LazyLock<test_suite::TestContext<CGGI, FFT64Ref>> =
-    LazyLock::new(|| test_suite::TestContext::<CGGI, FFT64Ref>::new());
+    LazyLock::new(test_suite::TestContext::<CGGI, FFT64Ref>::new);

 #[test]
 fn glwe_blind_retriever() {
--- a/poulpy-schemes/src/bin_fhe/bdd_arithmetic/tests/mod.rs
+++ b/poulpy-schemes/src/bin_fhe/bdd_arithmetic/tests/mod.rs
@@ -1,9 +1,9 @@
 pub mod test_suite;

 #[cfg(test)]
-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
 mod fft64_ref;

 #[cfg(test)]
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
 mod fft64_avx;
--- a/poulpy-schemes/src/bin_fhe/blind_rotation/tests/fft64_ref.rs
+++ b/poulpy-schemes/src/bin_fhe/blind_rotation/tests/fft64_ref.rs
@@ -1,7 +1,7 @@
 use poulpy_cpu_ref::FFT64Ref;
 use poulpy_hal::{api::ModuleNew, layouts::Module};

-use crate::tfhe::blind_rotation::{
+use crate::bin_fhe::blind_rotation::{
    CGGI,
    tests::test_suite::{
        generic_blind_rotation::test_blind_rotation,
--- a/poulpy-schemes/src/bin_fhe/blind_rotation/tests/mod.rs
+++ b/poulpy-schemes/src/bin_fhe/blind_rotation/tests/mod.rs
@@ -1,9 +1,9 @@
 #[cfg(test)]
-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
 mod fft64_ref;

 #[cfg(test)]
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
 mod fft64_avx;

 #[cfg(test)]
--- a/poulpy-schemes/src/bin_fhe/circuit_bootstrapping/tests/mod.rs
+++ b/poulpy-schemes/src/bin_fhe/circuit_bootstrapping/tests/mod.rs
@@ -1,9 +1,9 @@
 pub mod circuit_bootstrapping;

 #[cfg(test)]
-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[cfg(not(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma")))]
 mod fft64_ref;

 #[cfg(test)]
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[cfg(all(feature = "enable-avx", target_arch = "x86_64", target_feature = "avx2", target_feature = "fma"))]
 mod fft64_avx;