spqlios basic wrapper

This commit is contained in:
Jean-Philippe Bossuat
2025-01-26 12:26:44 +01:00
parent 7e9a9501b5
commit 06e4e58b2d
201 changed files with 30406 additions and 3 deletions

@@ -0,0 +1,142 @@
set(CMAKE_CXX_STANDARD 17)
set(test_incs ..)
set(gtest_libs)
set(benchmark_libs)
# searching for libgtest
find_path(gtest_inc NAMES gtest/gtest.h)
find_library(gtest NAMES gtest)
find_library(gtest_main REQUIRED NAMES gtest_main)
if (gtest_inc AND gtest AND gtest_main)
message(STATUS "Found gtest: I=${gtest_inc} L=${gtest},${gtest_main}")
set(test_incs ${test_incs} ${gtest_inc})
set(gtest_libs ${gtest_libs} ${gtest} ${gtest_main} pthread)
else()
message(FATAL_ERROR "Libgtest not found (required if ENABLE_TESTING is on): I=${gtest_inc} L=${gtest},${gtest_main}")
endif()
# searching for libbenchmark
find_path(benchmark_inc NAMES benchmark/benchmark.h)
find_library(benchmark NAMES benchmark)
if (benchmark_inc AND benchmark)
message(STATUS "Found benchmark: I=${benchmark_inc} L=${benchmark}")
set(test_incs ${test_incs} ${benchmark_inc})
set(benchmark_libs ${benchmark_libs} ${benchmark})
else()
message(FATAL_ERROR "Libbenchmark not found (required if ENABLE_TESTING is on): I=${benchmark_inc} L=${benchmark}")
endif()
find_path(VALGRIND_DIR NAMES valgrind/valgrind.h)
if (VALGRIND_DIR)
message(STATUS "Found valgrind header ${VALGRIND_DIR}")
else ()
# valgrind headers are optional: without them, VALGRIND_MEM_TESTS is not enabled for the test lib
message(STATUS "Valgrind header not found (VALGRIND_MEM_TESTS disabled): ${VALGRIND_DIR}")
endif ()
add_library(spqlios-testlib SHARED
testlib/random.cpp
testlib/test_commons.h
testlib/test_commons.cpp
testlib/mod_q120.h
testlib/mod_q120.cpp
testlib/negacyclic_polynomial.cpp
testlib/negacyclic_polynomial.h
testlib/negacyclic_polynomial_impl.h
testlib/reim4_elem.cpp
testlib/reim4_elem.h
testlib/fft64_dft.cpp
testlib/fft64_dft.h
testlib/fft64_layouts.h
testlib/fft64_layouts.cpp
testlib/ntt120_layouts.cpp
testlib/ntt120_layouts.h
testlib/ntt120_dft.cpp
testlib/ntt120_dft.h
testlib/test_hash.cpp
testlib/sha3.h
testlib/sha3.c
testlib/polynomial_vector.h
testlib/polynomial_vector.cpp
testlib/vec_rnx_layout.h
testlib/vec_rnx_layout.cpp
testlib/zn_layouts.h
testlib/zn_layouts.cpp
)
if (VALGRIND_DIR)
target_include_directories(spqlios-testlib PRIVATE ${VALGRIND_DIR})
target_compile_definitions(spqlios-testlib PRIVATE VALGRIND_MEM_TESTS)
endif ()
target_link_libraries(spqlios-testlib libspqlios)
# main unittest file
message(STATUS "${gtest_libs}")
set(UNITTEST_FILES
spqlios_test.cpp
spqlios_reim_conversions_test.cpp
spqlios_reim_test.cpp
spqlios_reim4_arithmetic_test.cpp
spqlios_cplx_test.cpp
spqlios_cplx_conversions_test.cpp
spqlios_q120_ntt_test.cpp
spqlios_q120_arithmetic_test.cpp
spqlios_coeffs_arithmetic_test.cpp
spqlios_vec_znx_big_test.cpp
spqlios_znx_small_test.cpp
spqlios_vmp_product_test.cpp
spqlios_vec_znx_dft_test.cpp
spqlios_svp_test.cpp
spqlios_svp_product_test.cpp
spqlios_vec_znx_test.cpp
spqlios_vec_rnx_test.cpp
spqlios_vec_rnx_vmp_test.cpp
spqlios_vec_rnx_conversions_test.cpp
spqlios_vec_rnx_ppol_test.cpp
spqlios_vec_rnx_approxdecomp_tnxdbl_test.cpp
spqlios_zn_approxdecomp_test.cpp
spqlios_zn_conversions_test.cpp
spqlios_zn_vmp_test.cpp
)
add_executable(spqlios-test ${UNITTEST_FILES})
target_link_libraries(spqlios-test spqlios-testlib libspqlios ${gtest_libs})
target_include_directories(spqlios-test PRIVATE ${test_incs})
add_test(NAME spqlios-test COMMAND spqlios-test)
if (WIN32)
# copy the dlls to the test directory
cmake_minimum_required(VERSION 3.26)
add_custom_command(
POST_BUILD
TARGET spqlios-test
COMMAND ${CMAKE_COMMAND} -E copy
-t $<TARGET_FILE_DIR:spqlios-test> $<TARGET_RUNTIME_DLLS:spqlios-testlib> $<TARGET_RUNTIME_DLLS:libspqlios>
COMMAND_EXPAND_LISTS
)
endif()
# benchmarks
add_executable(spqlios-cplx-fft-bench spqlios_cplx_fft_bench.cpp)
target_link_libraries(spqlios-cplx-fft-bench libspqlios ${benchmark_libs} pthread)
target_include_directories(spqlios-cplx-fft-bench PRIVATE ${test_incs})
if (X86 OR X86_WIN32)
add_executable(spqlios-q120-ntt-bench spqlios_q120_ntt_bench.cpp)
target_link_libraries(spqlios-q120-ntt-bench libspqlios ${benchmark_libs} pthread)
target_include_directories(spqlios-q120-ntt-bench PRIVATE ${test_incs})
add_executable(spqlios-q120-arithmetic-bench spqlios_q120_arithmetic_bench.cpp)
target_link_libraries(spqlios-q120-arithmetic-bench libspqlios ${benchmark_libs} pthread)
target_include_directories(spqlios-q120-arithmetic-bench PRIVATE ${test_incs})
endif ()
if (X86 OR X86_WIN32)
add_executable(spqlios_reim4_arithmetic_bench spqlios_reim4_arithmetic_bench.cpp)
target_link_libraries(spqlios_reim4_arithmetic_bench ${benchmark_libs} libspqlios pthread)
target_include_directories(spqlios_reim4_arithmetic_bench PRIVATE ${test_incs})
endif ()
if (DEVMODE_INSTALL)
install(TARGETS spqlios-testlib)
endif()
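# A typical way to exercise this file (assuming the top-level CMakeLists exposes the
# ENABLE_TESTING option referenced in the messages above) would be:
#   cmake -S . -B build -DENABLE_TESTING=ON
#   cmake --build build
#   ctest --test-dir build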

@@ -0,0 +1,488 @@
#include <gtest/gtest.h>
#include <sys/types.h>
#include <cstdint>
#include <limits>
#include <random>
#include <vector>
#include "../spqlios/coeffs/coeffs_arithmetic.h"
#include "test/testlib/mod_q120.h"
#include "testlib/negacyclic_polynomial.h"
#include "testlib/test_commons.h"
/// tests of element-wise operations
template <typename T, typename F, typename G>
void test_elemw_op(F elemw_op, G poly_elemw_op) {
for (uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<T> a = polynomial<T>::random(n);
polynomial<T> b = polynomial<T>::random(n);
polynomial<T> expect(n);
polynomial<T> actual(n);
// out of place
expect = poly_elemw_op(a, b);
elemw_op(n, actual.data(), a.data(), b.data());
ASSERT_EQ(actual, expect);
// in place 1
actual = polynomial<T>::random(n);
expect = poly_elemw_op(actual, b);
elemw_op(n, actual.data(), actual.data(), b.data());
ASSERT_EQ(actual, expect);
// in place 2
actual = polynomial<T>::random(n);
expect = poly_elemw_op(a, actual);
elemw_op(n, actual.data(), a.data(), actual.data());
ASSERT_EQ(actual, expect);
// in place 3
actual = polynomial<T>::random(n);
expect = poly_elemw_op(actual, actual);
elemw_op(n, actual.data(), actual.data(), actual.data());
ASSERT_EQ(actual, expect);
}
}
static polynomial<int64_t> poly_i64_add(const polynomial<int64_t>& u, const polynomial<int64_t>& v) { return u + v; }
static polynomial<int64_t> poly_i64_sub(const polynomial<int64_t>& u, const polynomial<int64_t>& v) { return u - v; }
TEST(coeffs_arithmetic, znx_add_i64_ref) { test_elemw_op<int64_t>(znx_add_i64_ref, poly_i64_add); }
TEST(coeffs_arithmetic, znx_sub_i64_ref) { test_elemw_op<int64_t>(znx_sub_i64_ref, poly_i64_sub); }
#ifdef __x86_64__
TEST(coeffs_arithmetic, znx_add_i64_avx) { test_elemw_op<int64_t>(znx_add_i64_avx, poly_i64_add); }
TEST(coeffs_arithmetic, znx_sub_i64_avx) { test_elemw_op<int64_t>(znx_sub_i64_avx, poly_i64_sub); }
#endif
/// tests of element-wise unary operations
template <typename T, typename F, typename G>
void test_elemw_unary_op(F elemw_op, G poly_elemw_op) {
for (uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<T> a = polynomial<T>::random(n);
polynomial<T> expect(n);
polynomial<T> actual(n);
// out of place
expect = poly_elemw_op(a);
elemw_op(n, actual.data(), a.data());
ASSERT_EQ(actual, expect);
// in place
actual = polynomial<T>::random(n);
expect = poly_elemw_op(actual);
elemw_op(n, actual.data(), actual.data());
ASSERT_EQ(actual, expect);
}
}
static polynomial<int64_t> poly_i64_neg(const polynomial<int64_t>& u) { return -u; }
static polynomial<int64_t> poly_i64_copy(const polynomial<int64_t>& u) { return u; }
TEST(coeffs_arithmetic, znx_neg_i64_ref) { test_elemw_unary_op<int64_t>(znx_negate_i64_ref, poly_i64_neg); }
TEST(coeffs_arithmetic, znx_copy_i64_ref) { test_elemw_unary_op<int64_t>(znx_copy_i64_ref, poly_i64_copy); }
#ifdef __x86_64__
TEST(coeffs_arithmetic, znx_neg_i64_avx) { test_elemw_unary_op<int64_t>(znx_negate_i64_avx, poly_i64_neg); }
#endif
/// tests of the rotations out of place
template <typename T, typename F>
void test_rotation_outplace(F rotate) {
for (uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<T> poly = polynomial<T>::random(n);
polynomial<T> expect(n);
polynomial<T> actual(n);
for (uint64_t trial = 0; trial < 10; ++trial) {
int64_t p = uniform_i64_bits(32);
// rotate by p
for (uint64_t i = 0; i < n; ++i) {
expect.set_coeff(i, poly.get_coeff(i - p));
}
// rotate using the function
rotate(n, p, actual.data(), poly.data());
ASSERT_EQ(actual, expect);
}
}
}
TEST(coeffs_arithmetic, rnx_rotate_f64) { test_rotation_outplace<double>(rnx_rotate_f64); }
TEST(coeffs_arithmetic, znx_rotate_i64) { test_rotation_outplace<int64_t>(znx_rotate_i64); }
/// tests of the rotations in place
template <typename T, typename F>
void test_rotation_inplace(F rotate) {
for (uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<T> poly = polynomial<T>::random(n);
polynomial<T> expect(n);
for (uint64_t trial = 0; trial < 10; ++trial) {
polynomial<T> actual = poly;
int64_t p = uniform_i64_bits(32);
// rotate by p
for (uint64_t i = 0; i < n; ++i) {
expect.set_coeff(i, poly.get_coeff(i - p));
}
// rotate using the function
rotate(n, p, actual.data());
ASSERT_EQ(actual, expect);
}
}
}
TEST(coeffs_arithmetic, rnx_rotate_inplace_f64) { test_rotation_inplace<double>(rnx_rotate_inplace_f64); }
TEST(coeffs_arithmetic, znx_rotate_inplace_i64) { test_rotation_inplace<int64_t>(znx_rotate_inplace_i64); }
/// tests of the multiplication by (X^p - 1), out of place
template <typename T, typename F>
void test_mul_xp_minus_one_outplace(F rotate) {
for (uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<T> poly = polynomial<T>::random(n);
polynomial<T> expect(n);
polynomial<T> actual(n);
for (uint64_t trial = 0; trial < 10; ++trial) {
int64_t p = uniform_i64_bits(32);
// expected result: (X^p - 1) * poly
for (uint64_t i = 0; i < n; ++i) {
expect.set_coeff(i, poly.get_coeff(i - p) - poly.get_coeff(i));
}
// multiply using the function under test
rotate(n, p, actual.data(), poly.data());
ASSERT_EQ(actual, expect);
}
}
}
TEST(coeffs_arithmetic, rnx_mul_xp_minus_one_f64) { test_mul_xp_minus_one_outplace<double>(rnx_mul_xp_minus_one); }
TEST(coeffs_arithmetic, znx_mul_xp_minus_one_i64) { test_mul_xp_minus_one_outplace<int64_t>(znx_mul_xp_minus_one); }
/// tests of the multiplication by (X^p - 1), in place
template <typename T, typename F>
void test_mul_xp_minus_one_inplace(F rotate) {
for (uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<T> poly = polynomial<T>::random(n);
polynomial<T> expect(n);
for (uint64_t trial = 0; trial < 10; ++trial) {
polynomial<T> actual = poly;
int64_t p = uniform_i64_bits(32);
// expected result: (X^p - 1) * poly
for (uint64_t i = 0; i < n; ++i) {
expect.set_coeff(i, poly.get_coeff(i - p) - poly.get_coeff(i));
}
// multiply using the function under test
rotate(n, p, actual.data());
ASSERT_EQ(actual, expect);
}
}
}
TEST(coeffs_arithmetic, rnx_mul_xp_minus_one_inplace_f64) {
test_mul_xp_minus_one_inplace<double>(rnx_mul_xp_minus_one_inplace);
}
// TEST(coeffs_arithmetic, znx_mul_xp_minus_one_inplace_i64) {
// test_mul_xp_minus_one_inplace<int64_t>(znx_rotate_inplace_i64); }
/// tests of the automorphisms out of place
template <typename T, typename F>
void test_automorphism_outplace(F automorphism) {
for (uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<T> poly = polynomial<T>::random(n);
polynomial<T> expect(n);
polynomial<T> actual(n);
for (uint64_t trial = 0; trial < 10; ++trial) {
int64_t p = uniform_i64_bits(32) | int64_t(1); // make it odd
// automorphism p
for (uint64_t i = 0; i < n; ++i) {
expect.set_coeff(i * p, poly.get_coeff(i));
}
// apply the automorphism using the function
automorphism(n, p, actual.data(), poly.data());
ASSERT_EQ(actual, expect);
}
}
}
TEST(coeffs_arithmetic, rnx_automorphism_f64) { test_automorphism_outplace<double>(rnx_automorphism_f64); }
TEST(coeffs_arithmetic, znx_automorphism_i64) { test_automorphism_outplace<int64_t>(znx_automorphism_i64); }
/// tests of the automorphisms in place
template <typename T, typename F>
void test_automorphism_inplace(F automorphism) {
for (uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096, 16384}) {
polynomial<T> poly = polynomial<T>::random(n);
polynomial<T> expect(n);
for (uint64_t trial = 0; trial < 20; ++trial) {
polynomial<T> actual = poly;
int64_t p = uniform_i64_bits(32) | int64_t(1); // make it odd
// automorphism p
for (uint64_t i = 0; i < n; ++i) {
expect.set_coeff(i * p, poly.get_coeff(i));
}
automorphism(n, p, actual.data());
if (!(actual == expect)) {
std::cerr << "automorphism p: " << p << std::endl;
for (uint64_t i = 0; i < n; ++i) {
std::cerr << i << " " << actual.get_coeff(i) << " vs " << expect.get_coeff(i) << " "
<< (actual.get_coeff(i) == expect.get_coeff(i)) << std::endl;
}
}
ASSERT_EQ(actual, expect);
}
}
}
TEST(coeffs_arithmetic, rnx_automorphism_inplace_f64) {
test_automorphism_inplace<double>(rnx_automorphism_inplace_f64);
}
TEST(coeffs_arithmetic, znx_automorphism_inplace_i64) {
test_automorphism_inplace<int64_t>(znx_automorphism_inplace_i64);
}
// TODO: write a test later!
/**
* @brief res = (X^p-1).in
* @param nn the ring dimension
* @param p must be between -2nn <= p <= 2nn
* @param in is a rnx/znx vector of dimension nn
*/
EXPORT void rnx_mul_xp_minus_one(uint64_t nn, int64_t p, double* res, const double* in);
EXPORT void znx_mul_xp_minus_one(uint64_t nn, int64_t p, int64_t* res, const int64_t* in);
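// As an illustration of the contract above: in the negacyclic ring R[X]/(X^nn + 1), coefficients
// wrap around with a sign flip, so for nn = 4 and p = 1 one gets
//   res[0] = -in[3] - in[0],  res[1] = in[0] - in[1],  res[2] = in[1] - in[2],  res[3] = in[2] - in[3],
// i.e. res[i] = in[i - p] - in[i] with in[i - p] read negacyclically, which is exactly what the
// expected values in the tests above compute.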
// normalize with no carry in nor carry out
template <uint8_t inplace_flag, typename F>
void test_znx_normalize(F normalize) {
for (const uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<int64_t> inp = znx_i64::random_log2bound(n, 62);
if (n >= 2) {
inp.set_coeff(0, -(INT64_C(1) << 62));
inp.set_coeff(1, (INT64_C(1) << 62));
}
for (const uint64_t base_k : {2, 3, 19, 35, 62}) {
polynomial<int64_t> out;
int64_t* inp_ptr;
if (inplace_flag == 1) {
out = polynomial<int64_t>(inp);
inp_ptr = out.data();
} else {
out = polynomial<int64_t>(n);
inp_ptr = inp.data();
}
znx_normalize(n, base_k, out.data(), nullptr, inp_ptr, nullptr);
for (uint64_t i = 0; i < n; ++i) {
const int64_t x = inp.get_coeff(i);
const int64_t y = out.get_coeff(i);
const int64_t y_exp = centermod(x, INT64_C(1) << base_k);
ASSERT_EQ(y, y_exp) << n << " " << base_k << " " << i << " " << x << " " << y;
}
}
}
}
TEST(coeffs_arithmetic, znx_normalize_outplace) { test_znx_normalize<0>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_inplace) { test_znx_normalize<1>(znx_normalize); }
// normalize with carry out (no carry in)
template <uint8_t inplace_flag, bool has_output, typename F>
void test_znx_normalize_cout(F normalize) {
static_assert(inplace_flag < 3, "either out or cout can be inplace with inp");
for (const uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<int64_t> inp = znx_i64::random_log2bound(n, 62);
if (n >= 2) {
inp.set_coeff(0, -(INT64_C(1) << 62));
inp.set_coeff(1, (INT64_C(1) << 62));
}
for (const uint64_t base_k : {2, 3, 19, 35, 62}) {
polynomial<int64_t> out, cout;
int64_t* inp_ptr;
if (inplace_flag == 1) {
// out and inp are the same
out = polynomial<int64_t>(inp);
inp_ptr = out.data();
cout = polynomial<int64_t>(n);
} else if (inplace_flag == 2) {
// carry out and inp are the same
cout = polynomial<int64_t>(inp);
inp_ptr = cout.data();
out = polynomial<int64_t>(n);
} else {
// inp, out and carry out are distinct
out = polynomial<int64_t>(n);
cout = polynomial<int64_t>(n);
inp_ptr = inp.data();
}
znx_normalize(n, base_k, has_output ? out.data() : nullptr, cout.data(), inp_ptr, nullptr);
for (uint64_t i = 0; i < n; ++i) {
const int64_t x = inp.get_coeff(i);
const int64_t co = cout.get_coeff(i);
const int64_t y_exp = centermod((int64_t)x, INT64_C(1) << base_k);
const int64_t co_exp = (x - y_exp) >> base_k;
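// the values checked below encode the decomposition x == co_exp * 2^base_k + y_exp,
// where y_exp is the centered representative of x modulo 2^base_k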
ASSERT_EQ(co, co_exp);
if (has_output) {
const int64_t y = out.get_coeff(i);
ASSERT_EQ(y, y_exp);
}
}
}
}
}
TEST(coeffs_arithmetic, znx_normalize_cout_outplace) { test_znx_normalize_cout<0, false>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_out_cout_outplace) { test_znx_normalize_cout<0, true>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_cout_inplace1) { test_znx_normalize_cout<1, false>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_out_cout_inplace1) { test_znx_normalize_cout<1, true>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_cout_inplace2) { test_znx_normalize_cout<2, false>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_out_cout_inplace2) { test_znx_normalize_cout<2, true>(znx_normalize); }
// normalize with carry in (no carry out)
template <uint8_t inplace_flag, typename F>
void test_znx_normalize_cin(F normalize) {
static_assert(inplace_flag < 3, "either inp or cin can be inplace with out");
for (const uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<int64_t> inp = znx_i64::random_log2bound(n, 62);
if (n >= 4) {
inp.set_coeff(0, -(INT64_C(1) << 62));
inp.set_coeff(1, -(INT64_C(1) << 62));
inp.set_coeff(2, (INT64_C(1) << 62));
inp.set_coeff(3, (INT64_C(1) << 62));
}
for (const uint64_t base_k : {2, 3, 19, 35, 62}) {
polynomial<int64_t> cin = znx_i64::random_log2bound(n, 62);
if (n >= 4) {
// also stress extreme carry-in values
cin.set_coeff(0, -(INT64_C(1) << 62));
cin.set_coeff(1, (INT64_C(1) << 62));
}
polynomial<int64_t> out;
int64_t *inp_ptr, *cin_ptr;
if (inplace_flag == 1) {
// out and inp are the same
out = polynomial<int64_t>(inp);
inp_ptr = out.data();
cin_ptr = cin.data();
} else if (inplace_flag == 2) {
// out and carry in are the same
out = polynomial<int64_t>(cin);
inp_ptr = inp.data();
cin_ptr = out.data();
} else {
// inp, carry in and out are distinct
out = polynomial<int64_t>(n);
inp_ptr = inp.data();
cin_ptr = cin.data();
}
znx_normalize(n, base_k, out.data(), nullptr, inp_ptr, cin_ptr);
for (uint64_t i = 0; i < n; ++i) {
const int64_t x = inp.get_coeff(i);
const int64_t ci = cin.get_coeff(i);
const int64_t y = out.get_coeff(i);
const __int128_t xp = (__int128_t)x + ci;
const int64_t y_exp = centermod((int64_t)xp, INT64_C(1) << base_k);
ASSERT_EQ(y, y_exp) << n << " " << base_k << " " << i << " " << x << " " << y << " " << ci;
}
}
}
}
TEST(coeffs_arithmetic, znx_normalize_cin_outplace) { test_znx_normalize_cin<0>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_cin_inplace1) { test_znx_normalize_cin<1>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_cin_inplace2) { test_znx_normalize_cin<2>(znx_normalize); }
// normalize with carry in and carry out
template <uint8_t inplace_flag, bool has_output, typename F>
void test_znx_normalize_cin_cout(F normalize) {
static_assert(inplace_flag < 7, "inplace_flag selects one of the 7 supported aliasing modes");
for (const uint64_t n : {1, 2, 4, 8, 16, 64, 256, 4096}) {
polynomial<int64_t> inp = znx_i64::random_log2bound(n, 62);
if (n >= 4) {
inp.set_coeff(0, -(INT64_C(1) << 62));
inp.set_coeff(1, -(INT64_C(1) << 62));
inp.set_coeff(2, (INT64_C(1) << 62));
inp.set_coeff(3, (INT64_C(1) << 62));
}
for (const uint64_t base_k : {2, 3, 19, 35, 62}) {
polynomial<int64_t> cin = znx_i64::random_log2bound(n, 62);
if (n >= 4) {
// also stress extreme carry-in values
cin.set_coeff(0, -(INT64_C(1) << 62));
cin.set_coeff(1, (INT64_C(1) << 62));
}
polynomial<int64_t> out, cout;
int64_t *inp_ptr, *cin_ptr;
if (inplace_flag == 1) {
// out == inp
out = polynomial<int64_t>(inp);
cout = polynomial<int64_t>(n);
inp_ptr = out.data();
cin_ptr = cin.data();
} else if (inplace_flag == 2) {
// cout == inp
out = polynomial<int64_t>(n);
cout = polynomial<int64_t>(inp);
inp_ptr = cout.data();
cin_ptr = cin.data();
} else if (inplace_flag == 3) {
// out == cin
out = polynomial<int64_t>(cin);
cout = polynomial<int64_t>(n);
inp_ptr = inp.data();
cin_ptr = out.data();
} else if (inplace_flag == 4) {
// cout == cin
out = polynomial<int64_t>(n);
cout = polynomial<int64_t>(cin);
inp_ptr = inp.data();
cin_ptr = cout.data();
} else if (inplace_flag == 5) {
// out == inp, cout == cin
out = polynomial<int64_t>(inp);
cout = polynomial<int64_t>(cin);
inp_ptr = out.data();
cin_ptr = cout.data();
} else if (inplace_flag == 6) {
// out == cin, cout == inp
out = polynomial<int64_t>(cin);
cout = polynomial<int64_t>(inp);
inp_ptr = cout.data();
cin_ptr = out.data();
} else {
out = polynomial<int64_t>(n);
cout = polynomial<int64_t>(n);
inp_ptr = inp.data();
cin_ptr = cin.data();
}
znx_normalize(n, base_k, has_output ? out.data() : nullptr, cout.data(), inp_ptr, cin_ptr);
for (uint64_t i = 0; i < n; ++i) {
const int64_t x = inp.get_coeff(i);
const int64_t ci = cin.get_coeff(i);
const int64_t co = cout.get_coeff(i);
const __int128_t xp = (__int128_t)x + ci;
const int64_t y_exp = centermod((int64_t)xp, INT64_C(1) << base_k);
const int64_t co_exp = (xp - y_exp) >> base_k;
ASSERT_EQ(co, co_exp);
if (has_output) {
const int64_t y = out.get_coeff(i);
ASSERT_EQ(y, y_exp);
}
}
}
}
}
TEST(coeffs_arithmetic, znx_normalize_cin_cout_outplace) { test_znx_normalize_cin_cout<0, false>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_out_cin_cout_outplace) { test_znx_normalize_cin_cout<0, true>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_cin_cout_inplace1) { test_znx_normalize_cin_cout<1, false>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_out_cin_cout_inplace1) { test_znx_normalize_cin_cout<1, true>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_cin_cout_inplace2) { test_znx_normalize_cin_cout<2, false>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_out_cin_cout_inplace2) { test_znx_normalize_cin_cout<2, true>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_cin_cout_inplace3) { test_znx_normalize_cin_cout<3, false>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_out_cin_cout_inplace3) { test_znx_normalize_cin_cout<3, true>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_cin_cout_inplace4) { test_znx_normalize_cin_cout<4, false>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_out_cin_cout_inplace4) { test_znx_normalize_cin_cout<4, true>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_cin_cout_inplace5) { test_znx_normalize_cin_cout<5, false>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_out_cin_cout_inplace5) { test_znx_normalize_cin_cout<5, true>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_cin_cout_inplace6) { test_znx_normalize_cin_cout<6, false>(znx_normalize); }
TEST(coeffs_arithmetic, znx_normalize_out_cin_cout_inplace6) { test_znx_normalize_cin_cout<6, true>(znx_normalize); }

@@ -0,0 +1,86 @@
#include <gtest/gtest.h>
#include <cmath>
#include "spqlios/cplx/cplx_fft_internal.h"
#include "spqlios/cplx/cplx_fft_private.h"
#ifdef __x86_64__
TEST(fft, cplx_from_znx32_ref_vs_fma) {
const uint32_t m = 128;
int32_t* src = (int32_t*)spqlios_alloc_custom_align(32, 10 * m * sizeof(int32_t));
CPLX* dst1 = (CPLX*)(src + 2 * m);
CPLX* dst2 = (CPLX*)(src + 6 * m);
for (uint64_t i = 0; i < 2 * m; ++i) {
src[i] = rand() - RAND_MAX / 2;
}
CPLX_FROM_ZNX32_PRECOMP precomp;
precomp.m = m;
cplx_from_znx32_ref(&precomp, dst1, src);
// cplx_from_znx32_simple(m, 32, dst1, src);
cplx_from_znx32_avx2_fma(&precomp, dst2, src);
for (uint64_t i = 0; i < m; ++i) {
ASSERT_EQ(dst1[i][0], dst2[i][0]);
ASSERT_EQ(dst1[i][1], dst2[i][1]);
}
spqlios_free(src);
}
#endif
#ifdef __x86_64__
TEST(fft, cplx_from_tnx32_ref_vs_fma) {
const uint32_t m = 128;
int32_t* src = (int32_t*)spqlios_alloc_custom_align(32, 10 * m * sizeof(int32_t));
CPLX* dst1 = (CPLX*)(src + 2 * m);
CPLX* dst2 = (CPLX*)(src + 6 * m);
for (uint64_t i = 0; i < 2 * m; ++i) {
src[i] = rand() + (rand() << 20);
}
CPLX_FROM_TNX32_PRECOMP precomp;
precomp.m = m;
cplx_from_tnx32_ref(&precomp, dst1, src);
// cplx_from_tnx32_simple(m, dst1, src);
cplx_from_tnx32_avx2_fma(&precomp, dst2, src);
for (uint64_t i = 0; i < m; ++i) {
ASSERT_EQ(dst1[i][0], dst2[i][0]);
ASSERT_EQ(dst1[i][1], dst2[i][1]);
}
spqlios_free(src);
}
#endif
#ifdef __x86_64__
TEST(fft, cplx_to_tnx32_ref_vs_fma) {
for (const uint32_t m : {8, 128, 1024, 65536}) {
for (const double divisor : {double(1), double(m), double(0.5)}) {
CPLX* src = (CPLX*)spqlios_alloc_custom_align(32, 10 * m * sizeof(int32_t));
int32_t* dst1 = (int32_t*)(src + m);
int32_t* dst2 = (int32_t*)(src + 2 * m);
for (uint64_t i = 0; i < 2 * m; ++i) {
src[i][0] = (rand() / double(RAND_MAX) - 0.5) * pow(2., 19 - (rand() % 60)) * divisor;
src[i][1] = (rand() / double(RAND_MAX) - 0.5) * pow(2., 19 - (rand() % 60)) * divisor;
}
CPLX_TO_TNX32_PRECOMP precomp;
precomp.m = m;
precomp.divisor = divisor;
cplx_to_tnx32_ref(&precomp, dst1, src);
cplx_to_tnx32_avx2_fma(&precomp, dst2, src);
// cplx_to_tnx32_simple(m, divisor, 18, dst2, src);
for (uint64_t i = 0; i < 2 * m; ++i) {
double truevalue =
(src[i % m][i / m] / divisor - floor(src[i % m][i / m] / divisor + 0.5)) * (INT64_C(1) << 32);
if (fabs(truevalue - floor(truevalue)) == 0.5) {
// ties can differ by 0, 1 or -1
ASSERT_LE(abs(dst1[i] - dst2[i]), 1)
<< i << " " << dst1[i] << " " << dst2[i] << " " << truevalue << std::endl;
} else {
// otherwise, we should have equality
ASSERT_LE(abs(dst1[i] - dst2[i]), 0)
<< i << " " << dst1[i] << " " << dst2[i] << " " << truevalue << std::endl;
}
}
spqlios_free(src);
}
}
}
#endif

@@ -0,0 +1,112 @@
#include <benchmark/benchmark.h>
#include <stdint.h>
#include <cassert>
#include <cmath>
#include <complex>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include "../spqlios/cplx/cplx_fft_internal.h"
#include "spqlios/reim/reim_fft.h"
using namespace std;
void init_random_values(uint64_t n, double* v) {
for (uint64_t i = 0; i < n; ++i) v[i] = rand() - (RAND_MAX >> 1);
}
void benchmark_cplx_fft(benchmark::State& state) {
const int32_t nn = state.range(0);
CPLX_FFT_PRECOMP* a = new_cplx_fft_precomp(nn / 2, 1);
double* c = (double*)cplx_fft_precomp_get_buffer(a, 0);
init_random_values(nn, c);
for (auto _ : state) {
// cplx_fft_simple(nn/2, c);
cplx_fft(a, c);
}
delete_cplx_fft_precomp(a);
}
void benchmark_cplx_ifft(benchmark::State& state) {
const int32_t nn = state.range(0);
CPLX_IFFT_PRECOMP* a = new_cplx_ifft_precomp(nn / 2, 1);
double* c = (double*)cplx_ifft_precomp_get_buffer(a, 0);
init_random_values(nn, c);
for (auto _ : state) {
// cplx_ifft_simple(nn/2, c);
cplx_ifft(a, c);
}
delete_cplx_ifft_precomp(a);
}
void benchmark_reim_fft(benchmark::State& state) {
const int32_t nn = state.range(0);
const uint32_t m = nn / 2;
REIM_FFT_PRECOMP* a = new_reim_fft_precomp(m, 1);
double* c = reim_fft_precomp_get_buffer(a, 0);
init_random_values(nn, c);
for (auto _ : state) {
// cplx_fft_simple(nn/2, c);
reim_fft(a, c);
}
delete_reim_fft_precomp(a);
}
#ifdef __aarch64__
EXPORT REIM_FFT_PRECOMP* new_reim_fft_precomp_neon(uint32_t m, uint32_t num_buffers);
EXPORT void reim_fft_neon(const REIM_FFT_PRECOMP* precomp, double* d);
void benchmark_reim_fft_neon(benchmark::State& state) {
const int32_t nn = state.range(0);
const uint32_t m = nn / 2;
REIM_FFT_PRECOMP* a = new_reim_fft_precomp_neon(m, 1);
double* c = reim_fft_precomp_get_buffer(a, 0);
init_random_values(nn, c);
for (auto _ : state) {
// cplx_fft_simple(nn/2, c);
reim_fft_neon(a, c);
}
delete_reim_fft_precomp(a);
}
#endif
void benchmark_reim_ifft(benchmark::State& state) {
const int32_t nn = state.range(0);
const uint32_t m = nn / 2;
REIM_IFFT_PRECOMP* a = new_reim_ifft_precomp(m, 1);
double* c = reim_ifft_precomp_get_buffer(a, 0);
init_random_values(nn, c);
for (auto _ : state) {
// cplx_ifft_simple(nn/2, c);
reim_ifft(a, c);
}
delete_reim_ifft_precomp(a);
}
// #define ARGS Arg(1024)->Arg(8192)->Arg(32768)->Arg(65536)
#define ARGS Arg(64)->Arg(256)->Arg(1024)->Arg(2048)->Arg(4096)->Arg(8192)->Arg(16384)->Arg(32768)->Arg(65536)
int main(int argc, char** argv) {
::benchmark::Initialize(&argc, argv);
if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
std::cout << "Dimensions n in the benchmark below are in \"real FFT\" modulo X^n+1" << std::endl;
std::cout << "The complex dimension m (modulo X^m-i) is half of it" << std::endl;
BENCHMARK(benchmark_cplx_fft)->ARGS;
BENCHMARK(benchmark_cplx_ifft)->ARGS;
BENCHMARK(benchmark_reim_fft)->ARGS;
#ifdef __aarch64__
BENCHMARK(benchmark_reim_fft_neon)->ARGS;
#endif
BENCHMARK(benchmark_reim_ifft)->ARGS;
// if (CPU_SUPPORTS("avx512f")) {
// BENCHMARK(bench_cplx_fftvec_twiddle_avx512)->ARGS;
// BENCHMARK(bench_cplx_fftvec_bitwiddle_avx512)->ARGS;
//}
::benchmark::RunSpecifiedBenchmarks();
::benchmark::Shutdown();
return 0;
}

@@ -0,0 +1,496 @@
#include <cmath>
#include "gtest/gtest.h"
#include "spqlios/commons_private.h"
#include "spqlios/cplx/cplx_fft.h"
#include "spqlios/cplx/cplx_fft_internal.h"
#include "spqlios/cplx/cplx_fft_private.h"
#ifdef __x86_64__
TEST(fft, ifft16_fma_vs_ref) {
CPLX data[16];
CPLX omega[8];
for (uint64_t i = 0; i < 32; ++i) ((double*)data)[i] = 2 * i + 1; //(rand()%100)-50;
for (uint64_t i = 0; i < 16; ++i) ((double*)omega)[i] = i + 1; //(rand()%100)-50;
CPLX copydata[16];
CPLX copyomega[8];
memcpy(copydata, data, sizeof(copydata));
memcpy(copyomega, omega, sizeof(copyomega));
cplx_ifft16_avx_fma(data, omega);
cplx_ifft16_ref(copydata, copyomega);
double distance = 0;
for (uint64_t i = 0; i < 16; ++i) {
double d1 = fabs(data[i][0] - copydata[i][0]);
double d2 = fabs(data[i][1] - copydata[i][1]);
if (d1 > distance) distance = d1;
if (d2 > distance) distance = d2;
}
/*
printf("data:\n");
for (uint64_t i=0; i<4; ++i) {
for (uint64_t j=0; j<8; ++j) {
printf("%.5lf ", data[4 * i + j / 2][j % 2]);
}
printf("\n");
}
printf("copydata:\n");
for (uint64_t i=0; i<4; ++i) {
for (uint64_t j=0; j<8; ++j) {
printf("%5.5lf ", copydata[4 * i + j / 2][j % 2]);
}
printf("\n");
}
*/
ASSERT_EQ(distance, 0);
}
#endif
void cplx_zero(CPLX r) { r[0] = r[1] = 0; }
void cplx_addmul(CPLX r, const CPLX a, const CPLX b) {
double re = r[0] + a[0] * b[0] - a[1] * b[1];
double im = r[1] + a[0] * b[1] + a[1] * b[0];
r[0] = re;
r[1] = im;
}
void halfcfft_eval(CPLX res, uint32_t nn, uint32_t k, const CPLX* coeffs, const CPLX* powomegas) {
const uint32_t N = nn / 2;
cplx_zero(res);
for (uint64_t i = 0; i < N; ++i) {
cplx_addmul(res, coeffs[i], powomegas[(k * i) % (2 * nn)]);
}
}
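// halfcfft_naive below is the quadratic reference used by the tests: each output coefficient is
// obtained by evaluating the input polynomial at a power of omega = exp(i*pi/nn) (tabulated in
// powomega), with the exponent derived from a bit-reversal of the output index.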
void halfcfft_naive(uint32_t nn, CPLX* data) {
const uint32_t N = nn / 2;
CPLX* in = (CPLX*)malloc(N * sizeof(CPLX));
CPLX* powomega = (CPLX*)malloc(2 * nn * sizeof(CPLX));
for (uint64_t i = 0; i < (2 * nn); ++i) {
powomega[i][0] = m_accurate_cos((M_PI * i) / nn);
powomega[i][1] = m_accurate_sin((M_PI * i) / nn);
}
memcpy(in, data, N * sizeof(CPLX));
for (uint64_t j = 0; j < N; ++j) {
uint64_t p = rint(log2(N)) + 2;
uint64_t k = revbits(p, j) + 1;
halfcfft_eval(data[j], nn, k, in, powomega);
}
free(powomega);
free(in);
}
#ifdef __x86_64__
TEST(fft, fft16_fma_vs_ref) {
CPLX data[16];
CPLX omega[8];
for (uint64_t i = 0; i < 32; ++i) ((double*)data)[i] = rand() % 1000;
for (uint64_t i = 0; i < 16; ++i) ((double*)omega)[i] = rand() % 1000;
CPLX copydata[16];
CPLX copyomega[8];
memcpy(copydata, data, sizeof(copydata));
memcpy(copyomega, omega, sizeof(copyomega));
cplx_fft16_avx_fma(data, omega);
cplx_fft16_ref(copydata, copyomega);
double distance = 0;
for (uint64_t i = 0; i < 16; ++i) {
double d1 = fabs(data[i][0] - copydata[i][0]);
double d2 = fabs(data[i][1] - copydata[i][1]);
if (d1 > distance) distance = d1;
if (d2 > distance) distance = d2;
}
ASSERT_EQ(distance, 0);
}
#endif
TEST(fft, citwiddle_then_invcitwiddle) {
CPLX om;
CPLX ombar;
CPLX data[2];
CPLX copydata[2];
om[0] = cos(3);
om[1] = sin(3);
ombar[0] = om[0];
ombar[1] = -om[1];
data[0][0] = 47;
data[0][1] = 23;
data[1][0] = -12;
data[1][1] = -9;
memcpy(copydata, data, sizeof(copydata));
citwiddle(data[0], data[1], om);
invcitwiddle(data[0], data[1], ombar);
double distance = 0;
for (uint64_t i = 0; i < 2; ++i) {
double d1 = fabs(data[i][0] - 2 * copydata[i][0]);
double d2 = fabs(data[i][1] - 2 * copydata[i][1]);
if (d1 > distance) distance = d1;
if (d2 > distance) distance = d2;
}
ASSERT_LE(distance, 1e-9);
}
TEST(fft, ctwiddle_then_invctwiddle) {
CPLX om;
CPLX ombar;
CPLX data[2];
CPLX copydata[2];
om[0] = cos(3);
om[1] = sin(3);
ombar[0] = om[0];
ombar[1] = -om[1];
data[0][0] = 47;
data[0][1] = 23;
data[1][0] = -12;
data[1][1] = -9;
memcpy(copydata, data, sizeof(copydata));
ctwiddle(data[0], data[1], om);
invctwiddle(data[0], data[1], ombar);
double distance = 0;
for (uint64_t i = 0; i < 2; ++i) {
double d1 = fabs(data[i][0] - 2 * copydata[i][0]);
double d2 = fabs(data[i][1] - 2 * copydata[i][1]);
if (d1 > distance) distance = d1;
if (d2 > distance) distance = d2;
}
ASSERT_LE(distance, 1e-9);
}
TEST(fft, fft16_then_ifft16_ref) {
CPLX full_omegas[64];
CPLX full_omegabars[64];
for (uint64_t i = 0; i < 64; ++i) {
full_omegas[i][0] = cos(M_PI * i / 32.);
full_omegas[i][1] = sin(M_PI * i / 32.);
full_omegabars[i][0] = full_omegas[i][0];
full_omegabars[i][1] = -full_omegas[i][1];
}
CPLX omega[8];
CPLX omegabar[8];
cplx_set(omega[0], full_omegas[8]); // j
cplx_set(omega[1], full_omegas[4]); // k
cplx_set(omega[2], full_omegas[2]); // l
cplx_set(omega[3], full_omegas[10]); // lj
cplx_set(omega[4], full_omegas[1]); // n
cplx_set(omega[5], full_omegas[9]); // nj
cplx_set(omega[6], full_omegas[5]); // nk
cplx_set(omega[7], full_omegas[13]); // njk
cplx_set(omegabar[0], full_omegabars[1]); // n
cplx_set(omegabar[1], full_omegabars[9]); // nj
cplx_set(omegabar[2], full_omegabars[5]); // nk
cplx_set(omegabar[3], full_omegabars[13]); // njk
cplx_set(omegabar[4], full_omegabars[2]); // l
cplx_set(omegabar[5], full_omegabars[10]); // lj
cplx_set(omegabar[6], full_omegabars[4]); // k
cplx_set(omegabar[7], full_omegabars[8]); // j
CPLX data[16];
CPLX copydata[16];
for (uint64_t i = 0; i < 32; ++i) ((double*)data)[i] = rand() % 1000;
memcpy(copydata, data, sizeof(copydata));
cplx_fft16_ref(data, omega);
cplx_ifft16_ref(data, omegabar);
double distance = 0;
for (uint64_t i = 0; i < 16; ++i) {
double d1 = fabs(data[i][0] - 16 * copydata[i][0]);
double d2 = fabs(data[i][1] - 16 * copydata[i][1]);
if (d1 > distance) distance = d1;
if (d2 > distance) distance = d2;
}
ASSERT_LE(distance, 1e-9);
}
TEST(fft, halfcfft_ref_vs_naive) {
for (uint64_t nn : {4, 8, 16, 64, 256, 8192}) {
uint64_t m = nn / 2;
CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, m * sizeof(CPLX));
CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, m * sizeof(CPLX));
CPLX* a2 = (CPLX*)spqlios_alloc_custom_align(32, m * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < m; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
}
memcpy(a1, a, m * sizeof(CPLX));
memcpy(a2, a, m * sizeof(CPLX));
halfcfft_naive(nn, a1);
cplx_fft_naive(m, 0.25, a2);
double d = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
double dre = fabs(a1[i][0] - a2[i][0]);
double dim = fabs(a1[i][1] - a2[i][1]);
if (dre > d) d = dre;
if (dim > d) d = dim;
}
ASSERT_LE(d, nn * 1e-10) << nn;
spqlios_free(a);
spqlios_free(a1);
spqlios_free(a2);
delete_cplx_fft_precomp(tables);
}
}
#ifdef __x86_64__
TEST(fft, halfcfft_fma_vs_ref) {
typedef void (*FFTF)(const CPLX_FFT_PRECOMP*, void* data);
for (FFTF fft : {cplx_fft_ref, cplx_fft_avx2_fma}) {
for (uint64_t nn : {8, 16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
CPLX* a2 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn / 2; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
}
memcpy(a1, a, nn / 2 * sizeof(CPLX));
memcpy(a2, a, nn / 2 * sizeof(CPLX));
cplx_fft_naive(m, 0.25, a2);
fft(tables, a1);
double d = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
double dre = fabs(a1[i][0] - a2[i][0]);
double dim = fabs(a1[i][1] - a2[i][1]);
if (dre > d) d = dre;
if (dim > d) d = dim;
}
ASSERT_LE(d, nn * 1e-10) << nn;
spqlios_free(a);
spqlios_free(a1);
spqlios_free(a2);
delete_cplx_fft_precomp(tables);
}
}
}
#endif
TEST(fft, halfcfft_then_ifft_ref) {
for (uint64_t nn : {4, 8, 16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
CPLX_IFFT_PRECOMP* itables = new_cplx_ifft_precomp(m, 0);
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn / 2; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
}
memcpy(a1, a, nn / 2 * sizeof(CPLX));
cplx_fft_ref(tables, a1);
cplx_ifft_ref(itables, a1);
double d = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
double dre = fabs(a[i][0] - a1[i][0] / (nn / 2));
double dim = fabs(a[i][1] - a1[i][1] / (nn / 2));
if (dre > d) d = dre;
if (dim > d) d = dim;
}
ASSERT_LE(d, 1e-8);
spqlios_free(a);
spqlios_free(a1);
delete_cplx_fft_precomp(tables);
delete_cplx_ifft_precomp(itables);
}
}
#ifdef __x86_64__
TEST(fft, halfcfft_ifft_fma_vs_ref) {
for (IFFT_FUNCTION ifft : {cplx_ifft_ref, cplx_ifft_avx2_fma}) {
for (uint64_t nn : {8, 16, 32, 1024, 4096, 8192, 65536}) {
uint64_t m = nn / 2;
CPLX_IFFT_PRECOMP* itables = new_cplx_ifft_precomp(m, 0);
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
CPLX* a2 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn / 2; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
}
memcpy(a1, a, nn / 2 * sizeof(CPLX));
memcpy(a2, a, nn / 2 * sizeof(CPLX));
cplx_ifft_naive(m, 0.25, a2);
ifft(itables, a1);
double d = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
double dre = fabs(a1[i][0] - a2[i][0]);
double dim = fabs(a1[i][1] - a2[i][1]);
if (dre > d) d = dre;
if (dim > d) d = dim;
}
ASSERT_LE(d, 1e-8);
spqlios_free(a);
spqlios_free(a1);
spqlios_free(a2);
delete_cplx_ifft_precomp(itables);
}
}
}
#endif
// test the reference and simple implementations of mul on all dimensions
TEST(fftvec, cplx_fftvec_mul_ref) {
for (uint64_t nn : {2, 4, 8, 16, 32, 1024, 4096, 8192, 65536}) {
uint64_t m = nn / 2;
CPLX_FFTVEC_MUL_PRECOMP* precomp = new_cplx_fftvec_mul_precomp(m);
CPLX* a = new CPLX[m];
CPLX* b = new CPLX[m];
CPLX* r0 = new CPLX[m];
CPLX* r1 = new CPLX[m];
CPLX* r2 = new CPLX[m];
int64_t p = 1 << 16;
for (uint32_t i = 0; i < m; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
b[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
b[i][1] = (rand() % p) - p / 2;
r2[i][0] = r1[i][0] = r0[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
r2[i][1] = r1[i][1] = r0[i][1] = (rand() % p) - p / 2;
}
cplx_fftvec_mul_simple(m, r0, a, b);
cplx_fftvec_mul_ref(precomp, r1, a, b);
for (uint32_t i = 0; i < m; i++) {
r2[i][0] = a[i][0] * b[i][0] - a[i][1] * b[i][1];
r2[i][1] = a[i][0] * b[i][1] + a[i][1] * b[i][0];
ASSERT_LE(fabs(r1[i][0] - r2[i][0]) + fabs(r1[i][1] - r2[i][1]), 1e-8);
ASSERT_LE(fabs(r0[i][0] - r2[i][0]) + fabs(r0[i][1] - r2[i][1]), 1e-8);
}
delete[] a;
delete[] b;
delete[] r0;
delete[] r1;
delete[] r2;
delete_cplx_fftvec_mul_precomp(precomp);
}
}
// test the reference and simple implementations of addmul on all dimensions
TEST(fftvec, cplx_fftvec_addmul_ref) {
for (uint64_t nn : {2, 4, 8, 16, 32, 1024, 4096, 8192, 65536}) {
uint64_t m = nn / 2;
CPLX_FFTVEC_ADDMUL_PRECOMP* precomp = new_cplx_fftvec_addmul_precomp(m);
CPLX* a = new CPLX[m];
CPLX* b = new CPLX[m];
CPLX* r0 = new CPLX[m];
CPLX* r1 = new CPLX[m];
CPLX* r2 = new CPLX[m];
int64_t p = 1 << 16;
for (uint32_t i = 0; i < m; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
b[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
b[i][1] = (rand() % p) - p / 2;
r2[i][0] = r1[i][0] = r0[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
r2[i][1] = r1[i][1] = r0[i][1] = (rand() % p) - p / 2;
}
cplx_fftvec_addmul_simple(m, r0, a, b);
cplx_fftvec_addmul_ref(precomp, r1, a, b);
for (uint32_t i = 0; i < m; i++) {
r2[i][0] += a[i][0] * b[i][0] - a[i][1] * b[i][1];
r2[i][1] += a[i][0] * b[i][1] + a[i][1] * b[i][0];
ASSERT_LE(fabs(r1[i][0] - r2[i][0]) + fabs(r1[i][1] - r2[i][1]), 1e-8);
ASSERT_LE(fabs(r0[i][0] - r2[i][0]) + fabs(r0[i][1] - r2[i][1]), 1e-8);
}
delete[] a;
delete[] b;
delete[] r0;
delete[] r1;
delete[] r2;
delete_cplx_fftvec_addmul_precomp(precomp);
}
}
// comparative tests between mul ref vs. optimized (only relevant dimensions)
TEST(fftvec, cplx_fftvec_mul_ref_vs_optim) {
struct totest {
FFTVEC_MUL_FUNCTION f;
uint64_t min_m;
totest(FFTVEC_MUL_FUNCTION f, uint64_t min_m) : f(f), min_m(min_m) {}
};
std::vector<totest> totestset;
totestset.emplace_back(cplx_fftvec_mul, 1);
#ifdef __x86_64__
totestset.emplace_back(cplx_fftvec_mul_fma, 8);
#endif
for (uint64_t m : {1, 2, 4, 8, 16, 1024, 4096, 8192, 65536}) {
CPLX_FFTVEC_MUL_PRECOMP* precomp = new_cplx_fftvec_mul_precomp(m);
for (const totest& t : totestset) {
if (t.min_m > m) continue;
CPLX* a = new CPLX[m];
CPLX* b = new CPLX[m];
CPLX* r1 = new CPLX[m];
CPLX* r2 = new CPLX[m];
int64_t p = 1 << 16;
for (uint32_t i = 0; i < m; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
b[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
b[i][1] = (rand() % p) - p / 2;
r2[i][0] = r1[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
r2[i][1] = r1[i][1] = (rand() % p) - p / 2;
}
t.f(precomp, r1, a, b);
cplx_fftvec_mul_ref(precomp, r2, a, b);
for (uint32_t i = 0; i < m; i++) {
double dre = fabs(r1[i][0] - r2[i][0]);
double dim = fabs(r1[i][1] - r2[i][1]);
ASSERT_LE(dre, 1e-8);
ASSERT_LE(dim, 1e-8);
}
delete[] a;
delete[] b;
delete[] r1;
delete[] r2;
}
delete_cplx_fftvec_mul_precomp(precomp);
}
}
// comparative tests between addmul ref vs. optimized (only relevant dimensions)
TEST(fftvec, cplx_fftvec_addmul_ref_vs_optim) {
struct totest {
FFTVEC_ADDMUL_FUNCTION f;
uint64_t min_m;
totest(FFTVEC_ADDMUL_FUNCTION f, uint64_t min_m) : f(f), min_m(min_m) {}
};
std::vector<totest> totestset;
totestset.emplace_back(cplx_fftvec_addmul, 1);
#ifdef __x86_64__
totestset.emplace_back(cplx_fftvec_addmul_fma, 8);
#endif
for (uint64_t m : {1, 2, 4, 8, 16, 1024, 4096, 8192, 65536}) {
CPLX_FFTVEC_ADDMUL_PRECOMP* precomp = new_cplx_fftvec_addmul_precomp(m);
for (const totest& t : totestset) {
if (t.min_m > m) continue;
CPLX* a = new CPLX[m];
CPLX* b = new CPLX[m];
CPLX* r1 = new CPLX[m];
CPLX* r2 = new CPLX[m];
int64_t p = 1 << 16;
for (uint32_t i = 0; i < m; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
b[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
b[i][1] = (rand() % p) - p / 2;
r2[i][0] = r1[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
r2[i][1] = r1[i][1] = (rand() % p) - p / 2;
}
t.f(precomp, r1, a, b);
cplx_fftvec_addmul_ref(precomp, r2, a, b);
for (uint32_t i = 0; i < m; i++) {
double dre = fabs(r1[i][0] - r2[i][0]);
double dim = fabs(r1[i][1] - r2[i][1]);
ASSERT_LE(dre, 1e-8);
ASSERT_LE(dim, 1e-8);
}
delete[] a;
delete[] b;
delete[] r1;
delete[] r2;
}
delete_cplx_fftvec_addmul_precomp(precomp);
}
}

@@ -0,0 +1,136 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include "spqlios/q120/q120_arithmetic.h"
#define ARGS Arg(128)->Arg(4096)->Arg(10000)
template <typeof(q120_vec_mat1col_product_baa_ref) f>
void benchmark_baa(benchmark::State& state) {
const uint64_t ell = state.range(0);
q120_mat1col_product_baa_precomp* precomp = q120_new_vec_mat1col_product_baa_precomp();
uint64_t* a = new uint64_t[ell * 4];
uint64_t* b = new uint64_t[ell * 4];
uint64_t* c = new uint64_t[4];
for (uint64_t i = 0; i < 4 * ell; i++) {
a[i] = rand();
b[i] = rand();
}
for (auto _ : state) {
f(precomp, ell, (q120b*)c, (q120a*)a, (q120a*)b);
}
delete[] c;
delete[] b;
delete[] a;
q120_delete_vec_mat1col_product_baa_precomp(precomp);
}
BENCHMARK(benchmark_baa<q120_vec_mat1col_product_baa_ref>)->Name("q120_vec_mat1col_product_baa_ref")->ARGS;
BENCHMARK(benchmark_baa<q120_vec_mat1col_product_baa_avx2>)->Name("q120_vec_mat1col_product_baa_avx2")->ARGS;
template <typeof(q120_vec_mat1col_product_bbb_ref) f>
void benchmark_bbb(benchmark::State& state) {
const uint64_t ell = state.range(0);
q120_mat1col_product_bbb_precomp* precomp = q120_new_vec_mat1col_product_bbb_precomp();
uint64_t* a = new uint64_t[ell * 4];
uint64_t* b = new uint64_t[ell * 4];
uint64_t* c = new uint64_t[4];
for (uint64_t i = 0; i < 4 * ell; i++) {
a[i] = rand();
b[i] = rand();
}
for (auto _ : state) {
f(precomp, ell, (q120b*)c, (q120b*)a, (q120b*)b);
}
delete[] c;
delete[] b;
delete[] a;
q120_delete_vec_mat1col_product_bbb_precomp(precomp);
}
BENCHMARK(benchmark_bbb<q120_vec_mat1col_product_bbb_ref>)->Name("q120_vec_mat1col_product_bbb_ref")->ARGS;
BENCHMARK(benchmark_bbb<q120_vec_mat1col_product_bbb_avx2>)->Name("q120_vec_mat1col_product_bbb_avx2")->ARGS;
template <typeof(q120_vec_mat1col_product_bbc_ref) f>
void benchmark_bbc(benchmark::State& state) {
const uint64_t ell = state.range(0);
q120_mat1col_product_bbc_precomp* precomp = q120_new_vec_mat1col_product_bbc_precomp();
uint64_t* a = new uint64_t[ell * 4];
uint64_t* b = new uint64_t[ell * 4];
uint64_t* c = new uint64_t[4];
for (uint64_t i = 0; i < 4 * ell; i++) {
a[i] = rand();
b[i] = rand();
}
for (auto _ : state) {
f(precomp, ell, (q120b*)c, (q120b*)a, (q120c*)b);
}
delete[] c;
delete[] b;
delete[] a;
q120_delete_vec_mat1col_product_bbc_precomp(precomp);
}
BENCHMARK(benchmark_bbc<q120_vec_mat1col_product_bbc_ref>)->Name("q120_vec_mat1col_product_bbc_ref")->ARGS;
BENCHMARK(benchmark_bbc<q120_vec_mat1col_product_bbc_avx2>)->Name("q120_vec_mat1col_product_bbc_avx2")->ARGS;
EXPORT void q120x2_vec_mat2cols_product_bbc_avx2(q120_mat1col_product_bbc_precomp* precomp, const uint64_t ell,
q120b* const res, const q120b* const x, const q120c* const y);
EXPORT void q120x2_vec_mat1col_product_bbc_avx2(q120_mat1col_product_bbc_precomp* precomp, const uint64_t ell,
q120b* const res, const q120b* const x, const q120c* const y);
template <typeof(q120_vec_mat1col_product_bbc_ref) f>
void benchmark_x2c2_bbc(benchmark::State& state) {
const uint64_t ell = state.range(0);
q120_mat1col_product_bbc_precomp* precomp = q120_new_vec_mat1col_product_bbc_precomp();
uint64_t* a = new uint64_t[ell * 8];
uint64_t* b = new uint64_t[ell * 16];
uint64_t* c = new uint64_t[16];
for (uint64_t i = 0; i < 8 * ell; i++) {
a[i] = rand();
}
for (uint64_t i = 0; i < 16 * ell; i++) {
b[i] = rand();
}
for (auto _ : state) {
f(precomp, ell, (q120b*)c, (q120b*)a, (q120c*)b);
}
delete[] c;
delete[] b;
delete[] a;
q120_delete_vec_mat1col_product_bbc_precomp(precomp);
}
BENCHMARK(benchmark_x2c2_bbc<q120x2_vec_mat2cols_product_bbc_avx2>)->Name("q120x2_vec_mat2cols_product_bbc_avx2")->ARGS;
template <typeof(q120_vec_mat1col_product_bbc_ref) f>
void benchmark_x2c1_bbc(benchmark::State& state) {
const uint64_t ell = state.range(0);
q120_mat1col_product_bbc_precomp* precomp = q120_new_vec_mat1col_product_bbc_precomp();
uint64_t* a = new uint64_t[ell * 8];
uint64_t* b = new uint64_t[ell * 8];
uint64_t* c = new uint64_t[8];
for (uint64_t i = 0; i < 8 * ell; i++) {
a[i] = rand();
}
for (uint64_t i = 0; i < 8 * ell; i++) {
b[i] = rand();
}
for (auto _ : state) {
f(precomp, ell, (q120b*)c, (q120b*)a, (q120c*)b);
}
delete[] c;
delete[] b;
delete[] a;
q120_delete_vec_mat1col_product_bbc_precomp(precomp);
}
BENCHMARK(benchmark_x2c1_bbc<q120x2_vec_mat1col_product_bbc_avx2>)->Name("q120x2_vec_mat1col_product_bbc_avx2")->ARGS;
BENCHMARK_MAIN();
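// The benchmarks above use the standard Google Benchmark driver, so the usual flags apply;
// an illustrative invocation (target name taken from the CMake file) would be:
//   ./spqlios-q120-arithmetic-bench --benchmark_filter=bbc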

@@ -0,0 +1,437 @@
#include <gtest/gtest.h>
#include <cstdint>
#include <vector>
#include "spqlios/q120/q120_arithmetic.h"
#include "test/testlib/negacyclic_polynomial.h"
#include "test/testlib/ntt120_layouts.h"
#include "testlib/mod_q120.h"
typedef typeof(q120_vec_mat1col_product_baa_ref) vec_mat1col_product_baa_f;
void test_vec_mat1col_product_baa(vec_mat1col_product_baa_f vec_mat1col_product_baa) {
q120_mat1col_product_baa_precomp* precomp = q120_new_vec_mat1col_product_baa_precomp();
for (uint64_t ell : {1, 2, 100, 10000}) {
std::vector<uint64_t> a(ell * 4);
std::vector<uint64_t> b(ell * 4);
std::vector<uint64_t> res(4);
uint64_t* pa = a.data();
uint64_t* pb = b.data();
uint64_t* pr = res.data();
// generate some random data
uniform_q120b(pr);
for (uint64_t i = 0; i < ell; ++i) {
uniform_q120a(pa + 4 * i);
uniform_q120a(pb + 4 * i);
}
// compute the expected result
mod_q120 expect_r;
for (uint64_t i = 0; i < ell; ++i) {
expect_r += mod_q120::from_q120a(pa + 4 * i) * mod_q120::from_q120a(pb + 4 * i);
}
// compute the function
vec_mat1col_product_baa(precomp, ell, (q120b*)pr, (q120a*)pa, (q120a*)pb);
mod_q120 comp_r = mod_q120::from_q120b(pr);
// check for equality
ASSERT_EQ(comp_r, expect_r) << ell;
}
q120_delete_vec_mat1col_product_baa_precomp(precomp);
}
TEST(q120_arithmetic, q120_vec_mat1col_product_baa_ref) {
test_vec_mat1col_product_baa(q120_vec_mat1col_product_baa_ref);
}
#ifdef __x86_64__
TEST(q120_arithmetic, q120_vec_mat1col_product_baa_avx2) {
test_vec_mat1col_product_baa(q120_vec_mat1col_product_baa_avx2);
}
#endif
typedef typeof(q120_vec_mat1col_product_bbb_ref) vec_mat1col_product_bbb_f;
void test_vec_mat1col_product_bbb(vec_mat1col_product_bbb_f vec_mat1col_product_bbb) {
q120_mat1col_product_bbb_precomp* precomp = q120_new_vec_mat1col_product_bbb_precomp();
for (uint64_t ell : {1, 2, 100, 10000}) {
std::vector<uint64_t> a(ell * 4);
std::vector<uint64_t> b(ell * 4);
std::vector<uint64_t> res(4);
uint64_t* pa = a.data();
uint64_t* pb = b.data();
uint64_t* pr = res.data();
// generate some random data
uniform_q120b(pr);
for (uint64_t i = 0; i < ell; ++i) {
uniform_q120b(pa + 4 * i);
uniform_q120b(pb + 4 * i);
}
// compute the expected result
mod_q120 expect_r;
for (uint64_t i = 0; i < ell; ++i) {
expect_r += mod_q120::from_q120b(pa + 4 * i) * mod_q120::from_q120b(pb + 4 * i);
}
// compute the function
vec_mat1col_product_bbb(precomp, ell, (q120b*)pr, (q120b*)pa, (q120b*)pb);
mod_q120 comp_r = mod_q120::from_q120b(pr);
// check for equality
ASSERT_EQ(comp_r, expect_r);
}
q120_delete_vec_mat1col_product_bbb_precomp(precomp);
}
TEST(q120_arithmetic, q120_vec_mat1col_product_bbb_ref) {
test_vec_mat1col_product_bbb(q120_vec_mat1col_product_bbb_ref);
}
#ifdef __x86_64__
TEST(q120_arithmetic, q120_vec_mat1col_product_bbb_avx2) {
test_vec_mat1col_product_bbb(q120_vec_mat1col_product_bbb_avx2);
}
#endif
typedef typeof(q120_vec_mat1col_product_bbc_ref) vec_mat1col_product_bbc_f;
void test_vec_mat1col_product_bbc(vec_mat1col_product_bbc_f vec_mat1col_product_bbc) {
q120_mat1col_product_bbc_precomp* precomp = q120_new_vec_mat1col_product_bbc_precomp();
for (uint64_t ell : {1, 2, 100, 10000}) {
std::vector<uint64_t> a(ell * 4);
std::vector<uint64_t> b(ell * 4);
std::vector<uint64_t> res(4);
uint64_t* pa = a.data();
uint64_t* pb = b.data();
uint64_t* pr = res.data();
// generate some random data
uniform_q120b(pr);
for (uint64_t i = 0; i < ell; ++i) {
uniform_q120b(pa + 4 * i);
uniform_q120c(pb + 4 * i);
}
// compute the expected result
mod_q120 expect_r;
for (uint64_t i = 0; i < ell; ++i) {
expect_r += mod_q120::from_q120b(pa + 4 * i) * mod_q120::from_q120c(pb + 4 * i);
}
// compute the function
vec_mat1col_product_bbc(precomp, ell, (q120b*)pr, (q120b*)pa, (q120c*)pb);
mod_q120 comp_r = mod_q120::from_q120b(pr);
// check for equality
ASSERT_EQ(comp_r, expect_r);
}
q120_delete_vec_mat1col_product_bbc_precomp(precomp);
}
TEST(q120_arithmetic, q120_vec_mat1col_product_bbc_ref) {
test_vec_mat1col_product_bbc(q120_vec_mat1col_product_bbc_ref);
}
#ifdef __x86_64__
TEST(q120_arithmetic, q120_vec_mat1col_product_bbc_avx2) {
test_vec_mat1col_product_bbc(q120_vec_mat1col_product_bbc_avx2);
}
#endif
EXPORT void q120x2_vec_mat2cols_product_bbc_avx2(q120_mat1col_product_bbc_precomp* precomp, const uint64_t ell,
q120b* const res, const q120b* const x, const q120c* const y);
EXPORT void q120x2_vec_mat1col_product_bbc_avx2(q120_mat1col_product_bbc_precomp* precomp, const uint64_t ell,
q120b* const res, const q120b* const x, const q120c* const y);
typedef typeof(q120x2_vec_mat2cols_product_bbc_avx2) q120x2_prod_bbc_f;
void test_q120x2_vec_mat2cols_product_bbc(q120x2_prod_bbc_f q120x2_prod_bbc) {
q120_mat1col_product_bbc_precomp* precomp = q120_new_vec_mat1col_product_bbc_precomp();
for (uint64_t ell : {1, 2, 100, 10000}) {
std::vector<uint64_t> a(ell * 8);
std::vector<uint64_t> b(ell * 16);
std::vector<uint64_t> res(16);
uint64_t* pa = a.data();
uint64_t* pb = b.data();
uint64_t* pr = res.data();
// generate some random data
uniform_q120b(pr);
for (uint64_t i = 0; i < 2 * ell; ++i) {
uniform_q120b(pa + 4 * i);
}
for (uint64_t i = 0; i < 4 * ell; ++i) {
uniform_q120c(pb + 4 * i);
}
// compute the expected result
mod_q120 expect_r[4];
for (uint64_t i = 0; i < ell; ++i) {
mod_q120 va = mod_q120::from_q120b(pa + 8 * i);
mod_q120 vb = mod_q120::from_q120b(pa + 8 * i + 4);
mod_q120 m1a = mod_q120::from_q120c(pb + 16 * i);
mod_q120 m1b = mod_q120::from_q120c(pb + 16 * i + 4);
mod_q120 m2a = mod_q120::from_q120c(pb + 16 * i + 8);
mod_q120 m2b = mod_q120::from_q120c(pb + 16 * i + 12);
expect_r[0] += va * m1a;
expect_r[1] += vb * m1b;
expect_r[2] += va * m2a;
expect_r[3] += vb * m2b;
}
// compute the function
q120x2_prod_bbc(precomp, ell, (q120b*)pr, (q120b*)pa, (q120c*)pb);
// check for equality
ASSERT_EQ(mod_q120::from_q120b(pr), expect_r[0]);
ASSERT_EQ(mod_q120::from_q120b(pr + 4), expect_r[1]);
ASSERT_EQ(mod_q120::from_q120b(pr + 8), expect_r[2]);
ASSERT_EQ(mod_q120::from_q120b(pr + 12), expect_r[3]);
}
q120_delete_vec_mat1col_product_bbc_precomp(precomp);
}
TEST(q120_arithmetic, q120x2_vec_mat2cols_product_bbc_ref) {
test_q120x2_vec_mat2cols_product_bbc(q120x2_vec_mat2cols_product_bbc_ref);
}
#ifdef __x86_64__
TEST(q120_arithmetic, q120x2_vec_mat2cols_product_bbc_avx2) {
test_q120x2_vec_mat2cols_product_bbc(q120x2_vec_mat2cols_product_bbc_avx2);
}
#endif
typedef typeof(q120x2_vec_mat1col_product_bbc_avx2) q120x2_c1_prod_bbc_f;
void test_q120x2_vec_mat1col_product_bbc(q120x2_c1_prod_bbc_f q120x2_c1_prod_bbc) {
q120_mat1col_product_bbc_precomp* precomp = q120_new_vec_mat1col_product_bbc_precomp();
for (uint64_t ell : {1, 2, 100, 10000}) {
std::vector<uint64_t> a(ell * 8);
std::vector<uint64_t> b(ell * 8);
std::vector<uint64_t> res(8);
uint64_t* pa = a.data();
uint64_t* pb = b.data();
uint64_t* pr = res.data();
// generate some random data
uniform_q120b(pr);
for (uint64_t i = 0; i < 2 * ell; ++i) {
uniform_q120b(pa + 4 * i);
}
for (uint64_t i = 0; i < 2 * ell; ++i) {
uniform_q120c(pb + 4 * i);
}
// compute the expected result
mod_q120 expect_r[2];
for (uint64_t i = 0; i < ell; ++i) {
mod_q120 va = mod_q120::from_q120b(pa + 8 * i);
mod_q120 vb = mod_q120::from_q120b(pa + 8 * i + 4);
mod_q120 m1a = mod_q120::from_q120c(pb + 8 * i);
mod_q120 m1b = mod_q120::from_q120c(pb + 8 * i + 4);
expect_r[0] += va * m1a;
expect_r[1] += vb * m1b;
}
// compute the function
q120x2_c1_prod_bbc(precomp, ell, (q120b*)pr, (q120b*)pa, (q120c*)pb);
// check for equality
ASSERT_EQ(mod_q120::from_q120b(pr), expect_r[0]);
ASSERT_EQ(mod_q120::from_q120b(pr + 4), expect_r[1]);
}
q120_delete_vec_mat1col_product_bbc_precomp(precomp);
}
TEST(q120_arithmetic, q120x2_vec_mat1col_product_bbc_ref) {
test_q120x2_vec_mat1col_product_bbc(q120x2_vec_mat1col_product_bbc_ref);
}
#ifdef __x86_64__
TEST(q120_arithmetic, q120x2_vec_mat1col_product_bbc_avx2) {
test_q120x2_vec_mat1col_product_bbc(q120x2_vec_mat1col_product_bbc_avx2);
}
#endif
typedef typeof(q120x2_extract_1blk_from_q120b_ref) q120x2_extract_f;
void test_q120x2_extract_1blk(q120x2_extract_f q120x2_extract) {
for (uint64_t n : {2, 4, 64}) {
ntt120_vec_znx_dft_layout v(n, 1);
std::vector<uint64_t> r(8);
std::vector<uint64_t> expect(8);
for (uint64_t blk = 0; blk < n / 2; ++blk) {
for (uint64_t i = 0; i < 8; ++i) {
expect[i] = uniform_u64();
}
memcpy(v.get_blk(0, blk), expect.data(), 8 * sizeof(uint64_t));
q120x2_extract_1blk_from_q120b_ref(n, blk, (q120x2b*)r.data(), (q120b*)v.data);
ASSERT_EQ(r, expect);
}
}
}
TEST(q120_arithmetic, q120x2_extract_1blk_from_q120b_ref) {
test_q120x2_extract_1blk(q120x2_extract_1blk_from_q120b_ref);
}
typedef typeof(q120x2_extract_1blk_from_contiguous_q120b_ref) q120x2_extract_vec_f;
void test_q120x2_extract_1blk_vec(q120x2_extract_vec_f q120x2_extract) {
for (uint64_t n : {2, 4, 32}) {
for (uint64_t size : {1, 2, 7}) {
ntt120_vec_znx_dft_layout v(n, size);
std::vector<uint64_t> r(8 * size);
std::vector<uint64_t> expect(8 * size);
for (uint64_t blk = 0; blk < n / 2; ++blk) {
for (uint64_t i = 0; i < 8 * size; ++i) {
expect[i] = uniform_u64();
}
for (uint64_t i = 0; i < size; ++i) {
memcpy(v.get_blk(i, blk), expect.data() + 8 * i, 8 * sizeof(uint64_t));
}
q120x2_extract(n, size, blk, (q120x2b*)r.data(), (q120b*)v.data);
ASSERT_EQ(r, expect);
}
}
}
}
TEST(q120_arithmetic, q120x2_extract_1blk_from_contiguous_q120b_ref) {
test_q120x2_extract_1blk_vec(q120x2_extract_1blk_from_contiguous_q120b_ref);
}
typedef typeof(q120x2b_save_1blk_to_q120b_ref) q120x2_save_f;
void test_q120x2_save_1blk(q120x2_save_f q120x2_save) {
for (uint64_t n : {2, 4, 64}) {
ntt120_vec_znx_dft_layout v(n, 1);
std::vector<uint64_t> r(8);
std::vector<uint64_t> expect(8);
for (uint64_t blk = 0; blk < n / 2; ++blk) {
for (uint64_t i = 0; i < 8; ++i) {
expect[i] = uniform_u64();
}
q120x2_save(n, blk, (q120b*)v.data, (q120x2b*)expect.data());
memcpy(r.data(), v.get_blk(0, blk), 8 * sizeof(uint64_t));
ASSERT_EQ(r, expect);
}
}
}
TEST(q120_arithmetic, q120x2b_save_1blk_to_q120b_ref) { test_q120x2_save_1blk(q120x2b_save_1blk_to_q120b_ref); }
TEST(q120_arithmetic, q120_add_bbb_simple) {
for (const uint64_t n : {2, 4, 1024}) {
std::vector<uint64_t> a(n * 4);
std::vector<uint64_t> b(n * 4);
std::vector<uint64_t> r(n * 4);
uint64_t* pa = a.data();
uint64_t* pb = b.data();
uint64_t* pr = r.data();
// generate some random data
for (uint64_t i = 0; i < n; ++i) {
uniform_q120b(pa + 4 * i);
uniform_q120b(pb + 4 * i);
}
// compute the function
q120_add_bbb_simple(n, (q120b*)pr, (q120b*)pa, (q120b*)pb);
for (uint64_t i = 0; i < n; ++i) {
mod_q120 ae = mod_q120::from_q120b(pa + 4 * i);
mod_q120 be = mod_q120::from_q120b(pb + 4 * i);
mod_q120 re = mod_q120::from_q120b(pr + 4 * i);
ASSERT_EQ(ae + be, re);
}
}
}
TEST(q120_arithmetic, q120_add_ccc_simple) {
for (const uint64_t n : {2, 4, 1024}) {
std::vector<uint64_t> a(n * 4);
std::vector<uint64_t> b(n * 4);
std::vector<uint64_t> r(n * 4);
uint64_t* pa = a.data();
uint64_t* pb = b.data();
uint64_t* pr = r.data();
// generate some random data
for (uint64_t i = 0; i < n; ++i) {
uniform_q120c(pa + 4 * i);
uniform_q120c(pb + 4 * i);
}
// compute the function
q120_add_ccc_simple(n, (q120c*)pr, (q120c*)pa, (q120c*)pb);
for (uint64_t i = 0; i < n; ++i) {
mod_q120 ae = mod_q120::from_q120c(pa + 4 * i);
mod_q120 be = mod_q120::from_q120c(pb + 4 * i);
mod_q120 re = mod_q120::from_q120c(pr + 4 * i);
ASSERT_EQ(ae + be, re);
}
}
}
TEST(q120_arithmetic, q120_c_from_b_simple) {
for (const uint64_t n : {2, 4, 1024}) {
std::vector<uint64_t> a(n * 4);
std::vector<uint64_t> r(n * 4);
uint64_t* pa = a.data();
uint64_t* pr = r.data();
// generate some random data
for (uint64_t i = 0; i < n; ++i) {
uniform_q120b(pa + 4 * i);
}
// compute the function
q120_c_from_b_simple(n, (q120c*)pr, (q120b*)pa);
for (uint64_t i = 0; i < n; ++i) {
mod_q120 ae = mod_q120::from_q120b(pa + 4 * i);
mod_q120 re = mod_q120::from_q120c(pr + 4 * i);
ASSERT_EQ(ae, re);
}
}
}
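// The conversions below treat a q120 value as a CRT residue vector: each
// coefficient is checked against centermod(x, Qi[k]) for the four primes
// mod_q120::Qi[0..3] whose product forms the ~120-bit modulus, which is why
// every coefficient occupies 4 limbs throughout these tests.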
TEST(q120_arithmetic, q120_b_from_znx64_simple) {
for (const uint64_t n : {2, 4, 1024}) {
znx_i64 x = znx_i64::random_log2bound(n, 62);
std::vector<uint64_t> r(n * 4);
uint64_t* pr = r.data();
q120_b_from_znx64_simple(n, (q120b*)pr, x.data());
for (uint64_t i = 0; i < n; ++i) {
mod_q120 re = mod_q120::from_q120b(pr + 4 * i);
for (uint64_t k = 0; k < 4; ++k) {
ASSERT_EQ(centermod(x.get_coeff(i), mod_q120::Qi[k]), re.a[k]);
}
}
}
}
TEST(q120_arithmetic, q120_c_from_znx64_simple) {
for (const uint64_t n : {2, 4, 1024}) {
znx_i64 x = znx_i64::random(n);
std::vector<uint64_t> r(n * 4);
uint64_t* pr = r.data();
q120_c_from_znx64_simple(n, (q120c*)pr, x.data());
for (uint64_t i = 0; i < n; ++i) {
mod_q120 re = mod_q120::from_q120c(pr + 4 * i);
for (uint64_t k = 0; k < 4; ++k) {
ASSERT_EQ(centermod(x.get_coeff(i), mod_q120::Qi[k]), re.a[k]);
}
}
}
}
TEST(q120_arithmetic, q120_b_to_znx128_simple) {
for (const uint64_t n : {2, 4, 1024}) {
std::vector<uint64_t> x(n * 4);
uint64_t* px = x.data();
// generate some random data
for (uint64_t i = 0; i < n; ++i) {
uniform_q120b(px + 4 * i);
}
znx_i128 r(n);
q120_b_to_znx128_simple(n, r.data(), (q120b*)px);
for (uint64_t i = 0; i < n; ++i) {
mod_q120 xe = mod_q120::from_q120b(px + 4 * i);
for (uint64_t k = 0; k < 4; ++k) {
ASSERT_EQ(centermod((int64_t)(r.get_coeff(i) % mod_q120::Qi[k]), mod_q120::Qi[k]), xe.a[k]);
}
}
}
}

View File

@@ -0,0 +1,44 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include "spqlios/q120/q120_ntt.h"
#define ARGS Arg(1 << 10)->Arg(1 << 11)->Arg(1 << 12)->Arg(1 << 13)->Arg(1 << 14)->Arg(1 << 15)->Arg(1 << 16)
template <typeof(q120_ntt_bb_avx2) f>
void benchmark_ntt(benchmark::State& state) {
const uint64_t n = state.range(0);
q120_ntt_precomp* precomp = q120_new_ntt_bb_precomp(n);
uint64_t* px = new uint64_t[n * 4];
for (uint64_t i = 0; i < 4 * n; i++) {
px[i] = ((uint64_t)rand() << 31) + rand();  // widen before shifting to avoid signed overflow
}
for (auto _ : state) {
f(precomp, (q120b*)px);
}
delete[] px;
q120_del_ntt_bb_precomp(precomp);
}
template <typeof(q120_intt_bb_avx2) f>
void benchmark_intt(benchmark::State& state) {
const uint64_t n = state.range(0);
q120_ntt_precomp* precomp = q120_new_intt_bb_precomp(n);
uint64_t* px = new uint64_t[n * 4];
for (uint64_t i = 0; i < 4 * n; i++) {
px[i] = ((uint64_t)rand() << 31) + rand();  // widen before shifting to avoid signed overflow
}
for (auto _ : state) {
f(precomp, (q120b*)px);
}
delete[] px;
q120_del_intt_bb_precomp(precomp);
}
BENCHMARK(benchmark_ntt<q120_ntt_bb_avx2>)->Name("q120_ntt_bb_avx2")->ARGS;
BENCHMARK(benchmark_intt<q120_intt_bb_avx2>)->Name("q120_intt_bb_avx2")->ARGS;
BENCHMARK_MAIN();
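// Usage note: individual sizes or transforms can be selected at run time with
// the standard google-benchmark filter flag, e.g. (binary name illustrative):
//   ./q120-ntt-bench --benchmark_filter='q120_ntt_bb_avx2/1024'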

View File

@@ -0,0 +1,174 @@
#include <gtest/gtest.h>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>
#include "spqlios/q120/q120_common.h"
#include "spqlios/q120/q120_ntt.h"
#include "testlib/mod_q120.h"
std::vector<mod_q120> q120_ntt(const std::vector<mod_q120>& x) {
const uint64_t n = x.size();
mod_q120 omega_2pow17{OMEGA1, OMEGA2, OMEGA3, OMEGA4};
mod_q120 omega = pow(omega_2pow17, (1 << 16) / n);
std::vector<mod_q120> res(n);
for (uint64_t i = 0; i < n; ++i) {
res[i] = x[i];
}
for (uint64_t i = 0; i < n; ++i) {
res[i] = res[i] * pow(omega, i);
}
for (uint64_t nn = n; nn > 1; nn /= 2) {
const uint64_t halfnn = nn / 2;
const uint64_t m = n / halfnn;
for (uint64_t j = 0; j < n; j += nn) {
for (uint64_t k = 0; k < halfnn; ++k) {
mod_q120 a = res[j + k];
mod_q120 b = res[j + halfnn + k];
res[j + k] = a + b;
res[j + halfnn + k] = (a - b) * pow(omega, k * m);
}
}
}
return res;
}
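// Inverse of the transform above: Cooley-Tukey (decimation-in-time) butterflies
// run in the reverse stage order, followed by the inverse twist omega^{-i} and
// the 1/n normalisation, so that q120_intt(q120_ntt(x)) == x.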
std::vector<mod_q120> q120_intt(const std::vector<mod_q120>& x) {
const uint64_t n = x.size();
mod_q120 omega_2pow17{OMEGA1, OMEGA2, OMEGA3, OMEGA4};
mod_q120 omega = pow(omega_2pow17, (1 << 16) / n);
std::vector<mod_q120> res(n);
for (uint64_t i = 0; i < n; ++i) {
res[i] = x[i];
}
for (uint64_t nn = 2; nn <= n; nn *= 2) {
const uint64_t halfnn = nn / 2;
const uint64_t m = n / halfnn;
for (uint64_t j = 0; j < n; j += nn) {
for (uint64_t k = 0; k < halfnn; ++k) {
mod_q120 a = res[j + k];
mod_q120 b = res[j + halfnn + k];
mod_q120 bo = b * pow(omega, -k * m);
res[j + k] = a + bo;
res[j + halfnn + k] = a - bo;
}
}
}
mod_q120 n_q120{(int64_t)n, (int64_t)n, (int64_t)n, (int64_t)n};
mod_q120 n_inv_q120 = pow(n_q120, -1);
for (uint64_t i = 0; i < n; ++i) {
mod_q120 po = pow(omega, -i) * n_inv_q120;
res[i] = res[i] * po;
}
return res;
}
class ntt : public testing::TestWithParam<uint64_t> {};
#ifdef __x86_64__
TEST_P(ntt, q120_ntt_bb_avx2) {
const uint64_t n = GetParam();
q120_ntt_precomp* precomp = q120_new_ntt_bb_precomp(n);
std::vector<uint64_t> x(n * 4);
uint64_t* px = x.data();
for (uint64_t i = 0; i < 4 * n; i += 4) {
uniform_q120b(px + i);
}
std::vector<mod_q120> x_modq(n);
for (uint64_t i = 0; i < n; ++i) {
x_modq[i] = mod_q120::from_q120b(px + 4 * i);
}
std::vector<mod_q120> y_exp = q120_ntt(x_modq);
q120_ntt_bb_avx2(precomp, (q120b*)px);
for (uint64_t i = 0; i < n; ++i) {
mod_q120 comp_r = mod_q120::from_q120b(px + 4 * i);
ASSERT_EQ(comp_r, y_exp[i]) << i;
}
q120_del_ntt_bb_precomp(precomp);
}
TEST_P(ntt, q120_intt_bb_avx2) {
const uint64_t n = GetParam();
q120_ntt_precomp* precomp = q120_new_intt_bb_precomp(n);
std::vector<uint64_t> x(n * 4);
uint64_t* px = x.data();
for (uint64_t i = 0; i < 4 * n; i += 4) {
uniform_q120b(px + i);
}
std::vector<mod_q120> x_modq(n);
for (uint64_t i = 0; i < n; ++i) {
x_modq[i] = mod_q120::from_q120b(px + 4 * i);
}
q120_intt_bb_avx2(precomp, (q120b*)px);
std::vector<mod_q120> y_exp = q120_intt(x_modq);
for (uint64_t i = 0; i < n; ++i) {
mod_q120 comp_r = mod_q120::from_q120b(px + 4 * i);
ASSERT_EQ(comp_r, y_exp[i]) << i;
}
q120_del_intt_bb_precomp(precomp);
}
TEST_P(ntt, q120_ntt_intt_bb_avx2) {
const uint64_t n = GetParam();
q120_ntt_precomp* precomp_ntt = q120_new_ntt_bb_precomp(n);
q120_ntt_precomp* precomp_intt = q120_new_intt_bb_precomp(n);
std::vector<uint64_t> x(n * 4);
uint64_t* px = x.data();
for (uint64_t i = 0; i < 4 * n; i += 4) {
uniform_q120b(px + i);
}
std::vector<mod_q120> x_modq(n);
for (uint64_t i = 0; i < n; ++i) {
x_modq[i] = mod_q120::from_q120b(px + 4 * i);
}
q120_ntt_bb_avx2(precomp_ntt, (q120b*)px);
q120_intt_bb_avx2(precomp_intt, (q120b*)px);
for (uint64_t i = 0; i < n; ++i) {
mod_q120 comp_r = mod_q120::from_q120b(px + 4 * i);
ASSERT_EQ(comp_r, x_modq[i]) << i;
}
q120_del_intt_bb_precomp(precomp_intt);
q120_del_ntt_bb_precomp(precomp_ntt);
}
INSTANTIATE_TEST_SUITE_P(q120, ntt,
testing::Values(1, 2, 4, 16, 256, UINT64_C(1) << 10, UINT64_C(1) << 11, UINT64_C(1) << 12,
UINT64_C(1) << 13, UINT64_C(1) << 14, UINT64_C(1) << 15, UINT64_C(1) << 16),
testing::PrintToStringParamName());
#endif

View File

@@ -0,0 +1,52 @@
#include <benchmark/benchmark.h>
#include "spqlios/reim4/reim4_arithmetic.h"
void init_random_values(uint64_t n, double* v) {
for (uint64_t i = 0; i < n; ++i)
v[i] = (double(rand() % (UINT64_C(1) << 14)) - (UINT64_C(1) << 13)) / (UINT64_C(1) << 12);
}
// Run the benchmark
BENCHMARK_MAIN();
#undef ARGS
#define ARGS Args({47, 16384})->Args({93, 32768})
/*
* reim4_vec_mat1col_product
* reim4_vec_mat2col_product
* reim4_vec_mat3col_product
* reim4_vec_mat4col_product
*/
template <uint64_t X,
void (*fnc)(const uint64_t nrows, double* const dst, const double* const u, const double* const v)>
void benchmark_reim4_vec_matXcols_product(benchmark::State& state) {
const uint64_t nrows = state.range(0);
double* u = new double[nrows * 8];
init_random_values(8 * nrows, u);
double* v = new double[nrows * X * 8];
init_random_values(X * 8 * nrows, v);
double* dst = new double[X * 8];
for (auto _ : state) {
fnc(nrows, dst, u, v);
}
delete[] dst;
delete[] v;
delete[] u;
}
#undef ARGS
#define ARGS Arg(128)->Arg(1024)->Arg(4096)
#ifdef __x86_64__
BENCHMARK(benchmark_reim4_vec_matXcols_product<1, reim4_vec_mat1col_product_avx2>)->ARGS;
// TODO: please remove when fixed:
BENCHMARK(benchmark_reim4_vec_matXcols_product<2, reim4_vec_mat2cols_product_avx2>)->ARGS;
#endif
BENCHMARK(benchmark_reim4_vec_matXcols_product<1, reim4_vec_mat1col_product_ref>)->ARGS;
BENCHMARK(benchmark_reim4_vec_matXcols_product<2, reim4_vec_mat2cols_product_ref>)->ARGS;

View File

@@ -0,0 +1,253 @@
#include <gtest/gtest.h>
#include <iostream>
#include <random>
#include "../spqlios/reim4/reim4_arithmetic.h"
#include "test/testlib/reim4_elem.h"
/// Actual tests
typedef typeof(reim4_extract_1blk_from_reim_ref) reim4_extract_1blk_from_reim_f;
void test_reim4_extract_1blk_from_reim(reim4_extract_1blk_from_reim_f reim4_extract_1blk_from_reim) {
static const uint64_t numtrials = 100;
for (uint64_t m : {4, 8, 16, 1024, 4096, 32768}) {
double* v = (double*)malloc(2 * m * sizeof(double));
double* w = (double*)malloc(8 * sizeof(double));
reim_view vv(m, v);
for (uint64_t i = 0; i < numtrials; ++i) {
reim4_elem el = gaussian_reim4();
uint64_t blk = rand() % (m / 4);
vv.set_blk(blk, el);
reim4_extract_1blk_from_reim(m, blk, w, v);
reim4_elem actual(w);
ASSERT_EQ(el, actual);
}
free(v);
free(w);
}
}
TEST(reim4_arithmetic, reim4_extract_1blk_from_reim_ref) {
test_reim4_extract_1blk_from_reim(reim4_extract_1blk_from_reim_ref);
}
#ifdef __x86_64__
TEST(reim4_arithmetic, reim4_extract_1blk_from_reim_avx) {
test_reim4_extract_1blk_from_reim(reim4_extract_1blk_from_reim_avx);
}
#endif
typedef typeof(reim4_save_1blk_to_reim_ref) reim4_save_1blk_to_reim_f;
void test_reim4_save_1blk_to_reim(reim4_save_1blk_to_reim_f reim4_save_1blk_to_reim) {
static const uint64_t numtrials = 100;
for (uint64_t m : {4, 8, 16, 1024, 4096, 32768}) {
double* v = (double*)malloc(2 * m * sizeof(double));
double* w = (double*)malloc(8 * sizeof(double));
reim_view vv(m, v);
for (uint64_t i = 0; i < numtrials; ++i) {
reim4_elem el = gaussian_reim4();
el.save_as(w);
uint64_t blk = rand() % (m / 4);
reim4_save_1blk_to_reim(m, blk, v, w);
reim4_elem actual = vv.get_blk(blk);
ASSERT_EQ(el, actual);
}
free(v);
free(w);
}
}
TEST(reim4_arithmetic, reim4_save_1blk_to_reim_ref) { test_reim4_save_1blk_to_reim(reim4_save_1blk_to_reim_ref); }
#ifdef __x86_64__
TEST(reim4_arithmetic, reim4_save_1blk_to_reim_avx) { test_reim4_save_1blk_to_reim(reim4_save_1blk_to_reim_avx); }
#endif
typedef typeof(reim4_extract_1blk_from_contiguous_reim_ref) reim4_extract_1blk_from_contiguous_reim_f;
void test_reim4_extract_1blk_from_contiguous_reim(
reim4_extract_1blk_from_contiguous_reim_f reim4_extract_1blk_from_contiguous_reim) {
static const uint64_t numtrials = 20;
for (uint64_t m : {4, 8, 16, 1024, 4096, 32768}) {
for (uint64_t nrows : {1, 2, 5, 128}) {
double* v = (double*)malloc(2 * m * nrows * sizeof(double));
double* w = (double*)malloc(8 * nrows * sizeof(double));
reim_vector_view vv(m, nrows, v);
reim4_array_view ww(nrows, w);
for (uint64_t i = 0; i < numtrials; ++i) {
uint64_t blk = rand() % (m / 4);
for (uint64_t j = 0; j < nrows; ++j) {
reim4_elem el = gaussian_reim4();
vv.row(j).set_blk(blk, el);
}
reim4_extract_1blk_from_contiguous_reim(m, nrows, blk, w, v);
for (uint64_t j = 0; j < nrows; ++j) {
reim4_elem el = vv.row(j).get_blk(blk);
reim4_elem actual = ww.get(j);
ASSERT_EQ(el, actual);
}
}
free(v);
free(w);
}
}
}
TEST(reim4_arithmetic, reim4_extract_1blk_from_contiguous_reim_ref) {
test_reim4_extract_1blk_from_contiguous_reim(reim4_extract_1blk_from_contiguous_reim_ref);
}
#ifdef __x86_64__
TEST(reim4_arithmetic, reim4_extract_1blk_from_contiguous_reim_avx) {
test_reim4_extract_1blk_from_contiguous_reim(reim4_extract_1blk_from_contiguous_reim_avx);
}
#endif
// test of basic arithmetic functions
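// A reim4_elem groups 4 complex slots in the re/im split layout used throughout
// spqlios (apparently 4 real parts followed by 4 imaginary parts, hence the
// 8-double blocks everywhere in this file); the +, * and += operators used for
// the expected values act lane-wise on those 4 complex slots.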
TEST(reim4_arithmetic, add) {
reim4_elem x = gaussian_reim4();
reim4_elem y = gaussian_reim4();
reim4_elem expect = x + y;
reim4_elem actual;
reim4_add(actual.value, x.value, y.value);
ASSERT_EQ(actual, expect);
}
TEST(reim4_arithmetic, mul) {
reim4_elem x = gaussian_reim4();
reim4_elem y = gaussian_reim4();
reim4_elem expect = x * y;
reim4_elem actual;
reim4_mul(actual.value, x.value, y.value);
ASSERT_EQ(actual, expect);
}
TEST(reim4_arithmetic, add_mul) {
reim4_elem x = gaussian_reim4();
reim4_elem y = gaussian_reim4();
reim4_elem z = gaussian_reim4();
reim4_elem expect = z;
reim4_elem actual = z;
expect += x * y;
reim4_add_mul(actual.value, x.value, y.value);
ASSERT_EQ(actual, expect) << infty_dist(expect, actual);
}
// test of dot products
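// The mat1col/mat2cols products below verify dst[c] = sum_i u[i] * v[i][c],
// where each entry is a reim4 block and '*' is the lane-wise complex product;
// the 1e-10 threshold leaves room for double rounding over up to 129 terms.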
typedef typeof(reim4_vec_mat1col_product_ref) reim4_vec_mat1col_product_f;
void test_reim4_vec_mat1col_product(reim4_vec_mat1col_product_f product) {
for (uint64_t ell : {1, 2, 5, 13, 69, 129}) {
std::vector<double> actual(8);
std::vector<double> a(ell * 8);
std::vector<double> b(ell * 8);
reim4_array_view va(ell, a.data());
reim4_array_view vb(ell, b.data());
reim4_array_view vactual(1, actual.data());
// initialize random values
for (uint64_t i = 0; i < ell; ++i) {
va.set(i, gaussian_reim4());
vb.set(i, gaussian_reim4());
}
// compute the mat1col product
reim4_elem expect;
for (uint64_t i = 0; i < ell; ++i) {
expect += va.get(i) * vb.get(i);
}
// compute the actual product
product(ell, actual.data(), a.data(), b.data());
// compare
ASSERT_LE(infty_dist(vactual.get(0), expect), 1e-10);
}
}
TEST(reim4_arithmetic, reim4_vec_mat1col_product_ref) { test_reim4_vec_mat1col_product(reim4_vec_mat1col_product_ref); }
#ifdef __x86_64__
TEST(reim4_arithmetic, reim4_vec_mat1col_product_avx2) { test_reim4_vec_mat1col_product(reim4_vec_mat1col_product_avx2); }
#endif
typedef typeof(reim4_vec_mat2cols_product_ref) reim4_vec_mat2col_product_f;
void test_reim4_vec_mat2cols_product(reim4_vec_mat2col_product_f product) {
for (uint64_t ell : {1, 2, 5, 13, 69, 129}) {
std::vector<double> actual(16);
std::vector<double> a(ell * 8);
std::vector<double> b(ell * 16);
reim4_array_view va(ell, a.data());
reim4_matrix_view vb(ell, 2, b.data());
reim4_array_view vactual(2, actual.data());
// initialize random values
for (uint64_t i = 0; i < ell; ++i) {
va.set(i, gaussian_reim4());
vb.set(i, 0, gaussian_reim4());
vb.set(i, 1, gaussian_reim4());
}
// compute the expected mat2cols product
reim4_elem expect[2];
for (uint64_t i = 0; i < ell; ++i) {
expect[0] += va.get(i) * vb.get(i, 0);
expect[1] += va.get(i) * vb.get(i, 1);
}
// compute the actual product
product(ell, actual.data(), a.data(), b.data());
// compare
ASSERT_LE(infty_dist(vactual.get(0), expect[0]), 1e-10);
ASSERT_LE(infty_dist(vactual.get(1), expect[1]), 1e-10);
}
}
TEST(reim4_arithmetic, reim4_vec_mat2cols_product_ref) {
test_reim4_vec_mat2cols_product(reim4_vec_mat2cols_product_ref);
}
#ifdef __x86_64__
TEST(reim4_arithmetic, reim4_vec_mat2cols_product_avx2) {
test_reim4_vec_mat2cols_product(reim4_vec_mat2cols_product_avx2);
}
#endif
// for now, we do not need avx implementations,
// so we will keep a single test function
TEST(reim4_arithmetic, reim4_vec_convolution_ref) {
for (uint64_t sizea : {1, 2, 3, 5, 8}) {
for (uint64_t sizeb : {1, 3, 6, 9, 13}) {
std::vector<double> a(8 * sizea);
std::vector<double> b(8 * sizeb);
std::vector<double> expect(8 * (sizea + sizeb - 1));
std::vector<double> actual(8 * (sizea + sizeb - 1));
reim4_array_view va(sizea, a.data());
reim4_array_view vb(sizeb, b.data());
std::vector<reim4_elem> vexpect(sizea + sizeb + 3);
reim4_array_view vactual(sizea + sizeb - 1, actual.data());
for (uint64_t i = 0; i < sizea; ++i) {
va.set(i, gaussian_reim4());
}
for (uint64_t j = 0; j < sizeb; ++j) {
vb.set(j, gaussian_reim4());
}
// manual convolution
for (uint64_t i = 0; i < sizea; ++i) {
for (uint64_t j = 0; j < sizeb; ++j) {
vexpect[i + j] += va.get(i) * vb.get(j);
}
}
// partial convolution single coeff
for (uint64_t k = 0; k < sizea + sizeb + 3; ++k) {
double dest[8] = {0};
reim4_convolution_1coeff_ref(k, dest, a.data(), sizea, b.data(), sizeb);
ASSERT_LE(infty_dist(reim4_elem(dest), vexpect[k]), 1e-10);
}
// partial convolution dual coeff
for (uint64_t k = 0; k < sizea + sizeb + 2; ++k) {
double dest[16] = {0};
reim4_convolution_2coeff_ref(k, dest, a.data(), sizea, b.data(), sizeb);
ASSERT_LE(infty_dist(reim4_elem(dest), vexpect[k]), 1e-10);
ASSERT_LE(infty_dist(reim4_elem(dest + 8), vexpect[k + 1]), 1e-10);
}
// actual convolution
reim4_convolution_ref(actual.data(), sizea + sizeb - 1, 0, a.data(), sizea, b.data(), sizeb);
for (uint64_t k = 0; k < sizea + sizeb - 1; ++k) {
ASSERT_LE(infty_dist(vactual.get(k), vexpect[k]), 1e-10) << k;
}
}
}
}
EXPORT void reim4_convolution_ref(double* dest, uint64_t dest_size, uint64_t dest_offset, const double* a,
uint64_t sizea, const double* b, uint64_t sizeb);

View File

@@ -0,0 +1,115 @@
#include <gtest/gtest.h>
#include <spqlios/reim/reim_fft_internal.h>
#include "testlib/test_commons.h"
TEST(reim_conversions, reim_to_tnx) {
for (uint32_t m : {1, 2, 64, 128, 512}) {
for (double divisor : {1, 2, int(m)}) {
for (uint32_t log2overhead : {1, 2, 10, 18, 35, 42}) {
double maxdiff = pow(2., log2overhead - 50);
std::vector<double> data(2 * m);
std::vector<double> dout(2 * m);
for (uint64_t i = 0; i < 2 * m; ++i) {
data[i] = (uniform_f64_01() - 0.5) * pow(2., log2overhead + 1) * divisor;
}
REIM_TO_TNX_PRECOMP* p = new_reim_to_tnx_precomp(m, divisor, 18);
reim_to_tnx_ref(p, dout.data(), data.data());
for (uint64_t i = 0; i < 2 * m; ++i) {
ASSERT_LE(fabs(dout[i]), 0.5);
double diff = dout[i] - data[i] / divisor;
double fracdiff = diff - rint(diff);
ASSERT_LE(fabs(fracdiff), maxdiff);
}
delete_reim_to_tnx_precomp(p);
}
}
}
}
#ifdef __x86_64__
TEST(reim_conversions, reim_to_tnx_ref_vs_avx) {
for (uint32_t m : {8, 16, 64, 128, 512}) {
for (double divisor : {1, 2, int(m)}) {
for (uint32_t log2overhead : {1, 2, 10, 18, 35, 42}) {
// double maxdiff = pow(2., log2overhead - 50);
std::vector<double> data(2 * m);
std::vector<double> dout1(2 * m);
std::vector<double> dout2(2 * m);
for (uint64_t i = 0; i < 2 * m; ++i) {
data[i] = (uniform_f64_01() - 0.5) * pow(2., log2overhead + 1) * divisor;
}
REIM_TO_TNX_PRECOMP* p = new_reim_to_tnx_precomp(m, divisor, 18);
reim_to_tnx_ref(p, dout1.data(), data.data());
reim_to_tnx_avx(p, dout2.data(), data.data());
for (uint64_t i = 0; i < 2 * m; ++i) {
ASSERT_LE(fabs(dout1[i] - dout2[i]), 0.);
}
delete_reim_to_tnx_precomp(p);
}
}
}
}
#endif
typedef typeof(reim_from_znx64_ref) reim_from_znx64_f;
void test_reim_from_znx64(reim_from_znx64_f reim_from_znx64, uint64_t maxbnd) {
for (uint32_t m : {4, 8, 16, 64, 16384}) {
REIM_FROM_ZNX64_PRECOMP* p = new_reim_from_znx64_precomp(m, maxbnd);
std::vector<int64_t> data(2 * m);
std::vector<double> dout(2 * m);
for (uint64_t i = 0; i < 2 * m; ++i) {
int64_t magnitude = int64_t(uniform_u64() % (maxbnd + 1));
data[i] = uniform_i64() >> (63 - magnitude);
REQUIRE_DRAMATICALLY(abs(data[i]) <= (INT64_C(1) << magnitude), "pb");
}
reim_from_znx64(p, dout.data(), data.data());
for (uint64_t i = 0; i < 2 * m; ++i) {
ASSERT_EQ(dout[i], double(data[i])) << dout[i] << " " << data[i];
}
delete_reim_from_znx64_precomp(p);
}
}
TEST(reim_conversions, reim_from_znx64) {
for (uint64_t maxbnd : {50}) {
test_reim_from_znx64(reim_from_znx64, maxbnd);
}
}
TEST(reim_conversions, reim_from_znx64_ref) { test_reim_from_znx64(reim_from_znx64_ref, 50); }
#ifdef __x86_64__
TEST(reim_conversions, reim_from_znx64_avx2_bnd50_fma) { test_reim_from_znx64(reim_from_znx64_bnd50_fma, 50); }
#endif
typedef typeof(reim_to_znx64_ref) reim_to_znx64_f;
void test_reim_to_znx64(reim_to_znx64_f reim_to_znx64_fcn, int64_t maxbnd) {
for (uint32_t m : {4, 8, 16, 64, 16384}) {
for (double divisor : {1, 2, int(m)}) {
REIM_TO_ZNX64_PRECOMP* p = new_reim_to_znx64_precomp(m, divisor, maxbnd);
std::vector<double> data(2 * m);
std::vector<int64_t> dout(2 * m);
for (uint64_t i = 0; i < 2 * m; ++i) {
int64_t magnitude = int64_t(uniform_u64() % (maxbnd + 11)) - 10;
data[i] = (uniform_f64_01() - 0.5) * pow(2., magnitude + 1) * divisor;
}
reim_to_znx64_fcn(p, dout.data(), data.data());
for (uint64_t i = 0; i < 2 * m; ++i) {
ASSERT_LE(dout[i] - data[i] / divisor, 0.5) << dout[i] << " " << data[i];
}
delete_reim_to_znx64_precomp(p);
}
}
}
TEST(reim_conversions, reim_to_znx64) {
for (uint64_t maxbnd : {63, 50}) {
test_reim_to_znx64(reim_to_znx64, maxbnd);
}
}
TEST(reim_conversions, reim_to_znx64_ref) { test_reim_to_znx64(reim_to_znx64_ref, 63); }
#ifdef __x86_64__
TEST(reim_conversions, reim_to_znx64_avx2_bnd63_fma) { test_reim_to_znx64(reim_to_znx64_avx2_bnd63_fma, 63); }
TEST(reim_conversions, reim_to_znx64_avx2_bnd50_fma) { test_reim_to_znx64(reim_to_znx64_avx2_bnd50_fma, 50); }
#endif

View File

@@ -0,0 +1,477 @@
#include <inttypes.h>
#include <cmath>
#include "gtest/gtest.h"
#include "spqlios/commons_private.h"
#include "spqlios/cplx/cplx_fft_internal.h"
#include "spqlios/reim/reim_fft_internal.h"
#include "spqlios/reim/reim_fft_private.h"
#ifdef __x86_64__
TEST(fft, reim_fft_avx2_vs_fft_reim_ref) {
for (uint64_t nn : {16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
// CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
REIM_FFT_PRECOMP* reimtables = new_reim_fft_precomp(m, 0);
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn / 2; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
}
memcpy(a1, a, nn / 2 * sizeof(CPLX));
memcpy(a2, a, nn / 2 * sizeof(CPLX));
reim_fft_ref(reimtables, a2);
reim_fft_avx2_fma(reimtables, a1);
double d = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
double dre = fabs(a1[i] - a2[i]);
double dim = fabs(a1[nn / 2 + i] - a2[nn / 2 + i]);
if (dre > d) d = dre;
if (dim > d) d = dim;
ASSERT_LE(d, nn * 1e-10) << nn;
}
ASSERT_LE(d, nn * 1e-10) << nn;
spqlios_free(a);
spqlios_free(a1);
spqlios_free(a2);
// delete_cplx_fft_precomp(tables);
delete_reim_fft_precomp(reimtables);
}
}
#endif
#ifdef __x86_64__
TEST(fft, reim_ifft_avx2_vs_reim_ifft_ref) {
for (uint64_t nn : {16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
// CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
REIM_IFFT_PRECOMP* reimtables = new_reim_ifft_precomp(m, 0);
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn / 2; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
}
memcpy(a1, a, nn / 2 * sizeof(CPLX));
memcpy(a2, a, nn / 2 * sizeof(CPLX));
reim_ifft_ref(reimtables, a2);
reim_ifft_avx2_fma(reimtables, a1);
double d = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
double dre = fabs(a1[i] - a2[i]);
double dim = fabs(a1[nn / 2 + i] - a2[nn / 2 + i]);
if (dre > d) d = dre;
if (dim > d) d = dim;
ASSERT_LE(d, 1e-8);
}
ASSERT_LE(d, 1e-8);
spqlios_free(a);
spqlios_free(a1);
spqlios_free(a2);
// delete_cplx_fft_precomp(tables);
delete_reim_ifft_precomp(reimtables);
}
}
#endif
#ifdef __x86_64__
TEST(fft, reim_vecfft_addmul_fma_vs_ref) {
for (uint64_t nn : {16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
REIM_FFTVEC_ADDMUL_PRECOMP* tbl = new_reim_fftvec_addmul_precomp(m);
ASSERT_TRUE(tbl != nullptr);
double* a1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* b1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* b2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn; i++) {
a1[i] = (rand() % p) - p / 2; // between -p/2 and p/2
b1[i] = (rand() % p) - p / 2;
r1[i] = (rand() % p) - p / 2;
}
memcpy(a2, a1, nn / 2 * sizeof(CPLX));
memcpy(b2, b1, nn / 2 * sizeof(CPLX));
memcpy(r2, r1, nn / 2 * sizeof(CPLX));
reim_fftvec_addmul_ref(tbl, r1, a1, b1);
reim_fftvec_addmul_fma(tbl, r2, a2, b2);
double d = 0;
for (uint32_t i = 0; i < nn; i++) {
double di = fabs(r1[i] - r2[i]);
if (di > d) d = di;
ASSERT_LE(d, 1e-8);
}
ASSERT_LE(d, 1e-8);
spqlios_free(a1);
spqlios_free(a2);
spqlios_free(b1);
spqlios_free(b2);
spqlios_free(r1);
spqlios_free(r2);
delete_reim_fftvec_addmul_precomp(tbl);
}
}
#endif
#ifdef __x86_64__
TEST(fft, reim_vecfft_mul_fma_vs_ref) {
for (uint64_t nn : {16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
REIM_FFTVEC_MUL_PRECOMP* tbl = new_reim_fftvec_mul_precomp(m);
double* a1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* b1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* b2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn; i++) {
a1[i] = (rand() % p) - p / 2; // between -p/2 and p/2
b1[i] = (rand() % p) - p / 2;
r1[i] = (rand() % p) - p / 2;
}
memcpy(a2, a1, nn / 2 * sizeof(CPLX));
memcpy(b2, b1, nn / 2 * sizeof(CPLX));
memcpy(r2, r1, nn / 2 * sizeof(CPLX));
reim_fftvec_mul_ref(tbl, r1, a1, b1);
reim_fftvec_mul_fma(tbl, r2, a2, b2);
double d = 0;
for (uint32_t i = 0; i < nn; i++) {
double di = fabs(r1[i] - r2[i]);
if (di > d) d = di;
ASSERT_LE(d, 1e-8);
}
ASSERT_LE(d, 1e-8);
spqlios_free(a1);
spqlios_free(a2);
spqlios_free(b1);
spqlios_free(b2);
spqlios_free(r1);
spqlios_free(r2);
delete_reim_fftvec_mul_precomp(tbl);
}
}
#endif
typedef void (*FILL_REIM_FFT_OMG_F)(const double entry_pwr, double** omg);
typedef void (*REIM_FFT_F)(double* dre, double* dim, const void* omega);
// template to test a fixed-dimension fft vs. naive
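// Convention checked here: the fill_*_omegas helper writes the twiddle factors
// consumed by the fixed-size kernel into the caller-provided buffer and
// advances the caller's pointer, so the ASSERT_EQ below doubles as a check that
// exactly N doubles were produced for a size-N transform.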
template <uint64_t N>
void test_reim_fft_ref_vs_naive(FILL_REIM_FFT_OMG_F fill_omega_f, REIM_FFT_F reim_fft_f) {
double om[N];
double data[2 * N];
double datacopy[2 * N];
double* omg = om;
fill_omega_f(0.25, &omg);
ASSERT_EQ(omg - om, ptrdiff_t(N)); // it may depend on N
for (uint64_t i = 0; i < N; ++i) {
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[N + i] = data[N + i] = (rand() % 100) - 50;
}
reim_fft_f(datacopy, datacopy + N, om);
reim_naive_fft(N, 0.25, data, data + N);
double d = 0;
for (uint64_t i = 0; i < 2 * N; ++i) {
d += fabs(datacopy[i] - data[i]);
}
ASSERT_LE(d, 1e-7);
}
template <uint64_t N>
void test_reim_fft_ref_vs_accel(REIM_FFT_F reim_fft_ref_f, REIM_FFT_F reim_fft_accel_f) {
double om[N];
double data[2 * N];
double datacopy[2 * N];
for (uint64_t i = 0; i < N; ++i) {
om[i] = (rand() % 100) - 50;
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[N + i] = data[N + i] = (rand() % 100) - 50;
}
reim_fft_ref_f(datacopy, datacopy + N, om);
reim_fft_accel_f(data, data + N, om);
double d = 0;
for (uint64_t i = 0; i < 2 * N; ++i) {
d += fabs(datacopy[i] - data[i]);
}
if (d > 1e-15) {
for (uint64_t i = 0; i < N; ++i) {
printf("%" PRId64 " %lf %lf %lf %lf\n", i, data[i], data[N + i], datacopy[i], datacopy[N + i]);
}
ASSERT_LE(d, 0);
}
}
TEST(fft, reim_fft16_ref_vs_naive) { test_reim_fft_ref_vs_naive<16>(fill_reim_fft16_omegas, reim_fft16_ref); }
#ifdef __aarch64__
TEST(fft, reim_fft16_neon_vs_naive) { test_reim_fft_ref_vs_naive<16>(fill_reim_fft16_omegas_neon, reim_fft16_neon); }
#endif
#ifdef __x86_64__
TEST(fft, reim_fft16_ref_vs_fma) { test_reim_fft_ref_vs_accel<16>(reim_fft16_ref, reim_fft16_avx_fma); }
#endif
#ifdef __aarch64__
static void reim_fft16_ref_neon_pom(double* dre, double* dim, const void* omega) {
const double* pom = (double*) omega;
// put the omegas in neon order
double x_pom[] = {
pom[0], pom[1], pom[2], pom[3],
pom[4], pom[5], pom[6], pom[7],
pom[8], pom[10], pom[12], pom[14],
pom[9], pom[11], pom[13], pom[15]
};
reim_fft16_ref(dre, dim, x_pom);
}
TEST(fft, reim_fft16_ref_vs_neon) { test_reim_fft_ref_vs_accel<16>(reim_fft16_ref_neon_pom, reim_fft16_neon); }
#endif
TEST(fft, reim_fft8_ref_vs_naive) { test_reim_fft_ref_vs_naive<8>(fill_reim_fft8_omegas, reim_fft8_ref); }
#ifdef __x86_64__
TEST(fft, reim_fft8_ref_vs_fma) { test_reim_fft_ref_vs_accel<8>(reim_fft8_ref, reim_fft8_avx_fma); }
#endif
TEST(fft, reim_fft4_ref_vs_naive) { test_reim_fft_ref_vs_naive<4>(fill_reim_fft4_omegas, reim_fft4_ref); }
#ifdef __x86_64__
TEST(fft, reim_fft4_ref_vs_fma) { test_reim_fft_ref_vs_accel<4>(reim_fft4_ref, reim_fft4_avx_fma); }
#endif
TEST(fft, reim_fft2_ref_vs_naive) { test_reim_fft_ref_vs_naive<2>(fill_reim_fft2_omegas, reim_fft2_ref); }
TEST(fft, reim_fft_bfs_16_ref_vs_naive) {
for (const uint64_t m : {16, 32, 64, 128, 256, 512, 1024, 2048}) {
std::vector<double> om(2 * m);
std::vector<double> data(2 * m);
std::vector<double> datacopy(2 * m);
double* omg = om.data();
fill_reim_fft_bfs_16_omegas(m, 0.25, &omg);
ASSERT_LE(omg - om.data(), ptrdiff_t(2 * m)); // it may depend on m
for (uint64_t i = 0; i < m; ++i) {
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[m + i] = data[m + i] = (rand() % 100) - 50;
}
omg = om.data();
reim_fft_bfs_16_ref(m, datacopy.data(), datacopy.data() + m, &omg);
reim_naive_fft(m, 0.25, data.data(), data.data() + m);
double d = 0;
for (uint64_t i = 0; i < 2 * m; ++i) {
d += fabs(datacopy[i] - data[i]);
}
ASSERT_LE(d, 1e-7);
}
}
TEST(fft, reim_fft_rec_16_ref_vs_naive) {
for (const uint64_t m : {2048, 4096, 8192, 32768, 65536}) {
std::vector<double> om(2 * m);
std::vector<double> data(2 * m);
std::vector<double> datacopy(2 * m);
double* omg = om.data();
fill_reim_fft_rec_16_omegas(m, 0.25, &omg);
ASSERT_LE(omg - om.data(), ptrdiff_t(2 * m)); // it may depend on m
for (uint64_t i = 0; i < m; ++i) {
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[m + i] = data[m + i] = (rand() % 100) - 50;
}
omg = om.data();
reim_fft_rec_16_ref(m, datacopy.data(), datacopy.data() + m, &omg);
reim_naive_fft(m, 0.25, data.data(), data.data() + m);
double d = 0;
for (uint64_t i = 0; i < 2 * m; ++i) {
d += fabs(datacopy[i] - data[i]);
}
ASSERT_LE(d, 1e-5);
}
}
TEST(fft, reim_fft_ref_vs_naive) {
for (const uint64_t m : {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 32768, 65536}) {
std::vector<double> om(2 * m);
std::vector<double> data(2 * m);
std::vector<double> datacopy(2 * m);
REIM_FFT_PRECOMP* precomp = new_reim_fft_precomp(m, 0);
for (uint64_t i = 0; i < m; ++i) {
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[m + i] = data[m + i] = (rand() % 100) - 50;
}
reim_fft_ref(precomp, datacopy.data());
reim_naive_fft(m, 0.25, data.data(), data.data() + m);
double d = 0;
for (uint64_t i = 0; i < 2 * m; ++i) {
d += fabs(datacopy[i] - data[i]);
}
ASSERT_LE(d, 1e-5) << m;
delete_reim_fft_precomp(precomp);
}
}
#ifdef __aarch64__
EXPORT REIM_FFT_PRECOMP* new_reim_fft_precomp_neon(uint32_t m, uint32_t num_buffers);
EXPORT void reim_fft_neon(const REIM_FFT_PRECOMP* precomp, double* d);
TEST(fft, reim_fft_neon_vs_naive) {
for (const uint64_t m : {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 32768, 65536}) {
std::vector<double> om(2 * m);
std::vector<double> data(2 * m);
std::vector<double> datacopy(2 * m);
REIM_FFT_PRECOMP* precomp = new_reim_fft_precomp_neon(m, 0);
for (uint64_t i = 0; i < m; ++i) {
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[m + i] = data[m + i] = (rand() % 100) - 50;
}
reim_fft_neon(precomp, datacopy.data());
reim_naive_fft(m, 0.25, data.data(), data.data() + m);
double d = 0;
for (uint64_t i = 0; i < 2 * m; ++i) {
d += fabs(datacopy[i] - data[i]);
}
ASSERT_LE(d, 1e-5) << m;
delete_reim_fft_precomp(precomp);
}
}
#endif
typedef void (*FILL_REIM_IFFT_OMG_F)(const double entry_pwr, double** omg);
typedef void (*REIM_IFFT_F)(double* dre, double* dim, const void* omega);
// template to test a fixed-dimension fft vs. naive
template <uint64_t N>
void test_reim_ifft_ref_vs_naive(FILL_REIM_IFFT_OMG_F fill_omega_f, REIM_IFFT_F reim_ifft_f) {
double om[N];
double data[2 * N];
double datacopy[2 * N];
double* omg = om;
fill_omega_f(0.25, &omg);
ASSERT_EQ(omg - om, ptrdiff_t(N)); // it may depend on N
for (uint64_t i = 0; i < N; ++i) {
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[N + i] = data[N + i] = (rand() % 100) - 50;
}
reim_ifft_f(datacopy, datacopy + N, om);
reim_naive_ifft(N, 0.25, data, data + N);
double d = 0;
for (uint64_t i = 0; i < 2 * N; ++i) {
d += fabs(datacopy[i] - data[i]);
}
ASSERT_LE(d, 1e-7);
}
template <uint64_t N>
void test_reim_ifft_ref_vs_accel(REIM_IFFT_F reim_ifft_ref_f, REIM_IFFT_F reim_ifft_accel_f) {
double om[N];
double data[2 * N];
double datacopy[2 * N];
for (uint64_t i = 0; i < N; ++i) {
om[i] = (rand() % 100) - 50;
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[N + i] = data[N + i] = (rand() % 100) - 50;
}
reim_ifft_ref_f(datacopy, datacopy + N, om);
reim_ifft_accel_f(data, data + N, om);
double d = 0;
for (uint64_t i = 0; i < 2 * N; ++i) {
d += fabs(datacopy[i] - data[i]);
}
if (d > 1e-15) {
for (uint64_t i = 0; i < N; ++i) {
printf("%" PRId64 " %lf %lf %lf %lf\n", i, data[i], data[N + i], datacopy[i], datacopy[N + i]);
}
ASSERT_LE(d, 0);
}
}
TEST(fft, reim_ifft16_ref_vs_naive) { test_reim_ifft_ref_vs_naive<16>(fill_reim_ifft16_omegas, reim_ifft16_ref); }
#ifdef __x86_64__
TEST(fft, reim_ifft16_ref_vs_fma) { test_reim_ifft_ref_vs_accel<16>(reim_ifft16_ref, reim_ifft16_avx_fma); }
#endif
TEST(fft, reim_ifft8_ref_vs_naive) { test_reim_ifft_ref_vs_naive<8>(fill_reim_ifft8_omegas, reim_ifft8_ref); }
#ifdef __x86_64__
TEST(fft, reim_ifft8_ref_vs_fma) { test_reim_ifft_ref_vs_accel<8>(reim_ifft8_ref, reim_ifft8_avx_fma); }
#endif
TEST(fft, reim_ifft4_ref_vs_naive) { test_reim_ifft_ref_vs_naive<4>(fill_reim_ifft4_omegas, reim_ifft4_ref); }
#ifdef __x86_64__
TEST(fft, reim_ifft4_ref_vs_fma) { test_reim_ifft_ref_vs_accel<4>(reim_ifft4_ref, reim_ifft4_avx_fma); }
#endif
TEST(fft, reim_ifft2_ref_vs_naive) { test_reim_ifft_ref_vs_naive<2>(fill_reim_ifft2_omegas, reim_ifft2_ref); }
TEST(fft, reim_ifft_bfs_16_ref_vs_naive) {
for (const uint64_t m : {16, 32, 64, 128, 256, 512, 1024, 2048}) {
std::vector<double> om(2 * m);
std::vector<double> data(2 * m);
std::vector<double> datacopy(2 * m);
double* omg = om.data();
fill_reim_ifft_bfs_16_omegas(m, 0.25, &omg);
ASSERT_LE(omg - om.data(), ptrdiff_t(2 * m)); // it may depend on m
for (uint64_t i = 0; i < m; ++i) {
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[m + i] = data[m + i] = (rand() % 100) - 50;
}
omg = om.data();
reim_ifft_bfs_16_ref(m, datacopy.data(), datacopy.data() + m, &omg);
reim_naive_ifft(m, 0.25, data.data(), data.data() + m);
double d = 0;
for (uint64_t i = 0; i < 2 * m; ++i) {
d += fabs(datacopy[i] - data[i]);
}
ASSERT_LE(d, 1e-7);
}
}
TEST(fft, reim_ifft_rec_16_ref_vs_naive) {
for (const uint64_t m : {2048, 4096, 8192, 32768, 65536}) {
std::vector<double> om(2 * m);
std::vector<double> data(2 * m);
std::vector<double> datacopy(2 * m);
double* omg = om.data();
fill_reim_ifft_rec_16_omegas(m, 0.25, &omg);
ASSERT_LE(omg - om.data(), ptrdiff_t(2 * m)); // it may depend on m
for (uint64_t i = 0; i < m; ++i) {
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[m + i] = data[m + i] = (rand() % 100) - 50;
}
omg = om.data();
reim_ifft_rec_16_ref(m, datacopy.data(), datacopy.data() + m, &omg);
reim_naive_ifft(m, 0.25, data.data(), data.data() + m);
double d = 0;
for (uint64_t i = 0; i < 2 * m; ++i) {
d += fabs(datacopy[i] - data[i]);
}
ASSERT_LE(d, 1e-5);
}
}
TEST(fft, reim_ifft_ref_vs_naive) {
for (const uint64_t m : {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 32768, 65536}) {
std::vector<double> om(2 * m);
std::vector<double> data(2 * m);
std::vector<double> datacopy(2 * m);
REIM_IFFT_PRECOMP* precomp = new_reim_ifft_precomp(m, 0);
for (uint64_t i = 0; i < m; ++i) {
datacopy[i] = data[i] = (rand() % 100) - 50;
datacopy[m + i] = data[m + i] = (rand() % 100) - 50;
}
reim_ifft_ref(precomp, datacopy.data());
reim_naive_ifft(m, 0.25, data.data(), data.data() + m);
double d = 0;
for (uint64_t i = 0; i < 2 * m; ++i) {
d += fabs(datacopy[i] - data[i]);
}
ASSERT_LE(d, 1e-5) << m;
delete_reim_ifft_precomp(precomp);
}
}

View File

@@ -0,0 +1,28 @@
#include <gtest/gtest.h>
#include "../spqlios/arithmetic/vec_znx_arithmetic_private.h"
#include "testlib/fft64_dft.h"
#include "testlib/fft64_layouts.h"
#include "testlib/polynomial_vector.h"
// todo: remove when registered
typedef typeof(fft64_svp_prepare_ref) SVP_PREPARE_F;
void test_fft64_svp_prepare(SVP_PREPARE_F svp_prepare) {
for (uint64_t n : {2, 4, 8, 64, 128}) {
MODULE* module = new_module_info(n, FFT64);
znx_i64 in = znx_i64::random_log2bound(n, 40);
fft64_svp_ppol_layout out(n);
reim_fft64vec expect = simple_fft64(in);
svp_prepare(module, out.data, in.data());
const double* ed = (double*)expect.data();
const double* ac = (double*)out.data;
for (uint64_t i = 0; i < n; ++i) {
ASSERT_LE(abs(ed[i] - ac[i]), 1e-10) << i << n;
}
delete_module_info(module);
}
}
TEST(svp_prepare, fft64_svp_prepare_ref) { test_fft64_svp_prepare(fft64_svp_prepare_ref); }
TEST(svp_prepare, svp_prepare) { test_fft64_svp_prepare(svp_prepare); }

View File

@@ -0,0 +1,47 @@
#include <gtest/gtest.h>
#include "../spqlios/arithmetic/vec_znx_arithmetic_private.h"
#include "testlib/fft64_dft.h"
#include "testlib/fft64_layouts.h"
#include "testlib/polynomial_vector.h"
void test_fft64_svp_apply_dft(SVP_APPLY_DFT_F svp) {
for (uint64_t n : {2, 4, 8, 64, 128}) {
MODULE* module = new_module_info(n, FFT64);
// poly 1 to multiply - create and prepare
fft64_svp_ppol_layout ppol(n);
ppol.fill_random(1.);
for (uint64_t sa : {3, 5, 8}) {
for (uint64_t sr : {3, 5, 8}) {
uint64_t a_sl = n + uniform_u64_bits(2);
// poly 2 to multiply
znx_vec_i64_layout a(n, sa, a_sl);
a.fill_random(19);
// original operation result
fft64_vec_znx_dft_layout res(n, sr);
thash hash_a_before = a.content_hash();
thash hash_ppol_before = ppol.content_hash();
svp(module, res.data, sr, ppol.data, a.data(), sa, a_sl);
ASSERT_EQ(a.content_hash(), hash_a_before);
ASSERT_EQ(ppol.content_hash(), hash_ppol_before);
// create expected value
reim_fft64vec ppo = ppol.get_copy();
std::vector<reim_fft64vec> expect(sr);
for (uint64_t i = 0; i < sr; ++i) {
expect[i] = ppo * simple_fft64(a.get_copy_zext(i));
}
// this is the largest precision we can safely expect
double prec_expect = n * pow(2., 19 - 52);
for (uint64_t i = 0; i < sr; ++i) {
reim_fft64vec actual = res.get_copy_zext(i);
ASSERT_LE(infty_dist(actual, expect[i]), prec_expect);
}
}
}
delete_module_info(module);
}
}
TEST(fft64_svp_apply_dft, svp_apply_dft) { test_fft64_svp_apply_dft(svp_apply_dft); }
TEST(fft64_svp_apply_dft, fft64_svp_apply_dft_ref) { test_fft64_svp_apply_dft(fft64_svp_apply_dft_ref); }

View File

@@ -0,0 +1,493 @@
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <complex>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "spqlios/cplx/cplx_fft_internal.h"
using namespace std;
/*namespace {
bool very_close(const double& a, const double& b) {
bool reps = (abs(a - b) < 1e-5);
if (!reps) {
cerr << "not close: " << a << " vs. " << b << endl;
}
return reps;
}
}*/ // namespace
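// Exactness experiment: two integer polynomials with k-bit coefficients are
// multiplied through fft / pointwise addmul / ifft (the 1/(nn/2) normalisation
// is applied manually below). If the maximum distance of the normalised result
// to the nearest integer stays below 0.5, the product rounds back to the exact
// integer coefficients; the printed stdev gives an idea of the noise margin.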
TEST(fft, fftvec_convolution) {
uint64_t nn = 65536; // vary across (8192, 16384), 32768, 65536
static const uint64_t k = 18; // vary from 10 to 20
// double* buf_fft = fft_precomp_get_buffer(tables, 0);
// double* buf_ifft = ifft_precomp_get_buffer(itables, 0);
double* a = (double*)spqlios_alloc_custom_align(32, nn * 8);
double* a2 = (double*)spqlios_alloc_custom_align(32, nn * 8);
double* b = (double*)spqlios_alloc_custom_align(32, nn * 8);
double* dist_vector = (double*)spqlios_alloc_custom_align(32, nn * 8);
int64_t p = UINT64_C(1) << k;
printf("p size: %" PRId64 "\n", p);
for (uint32_t i = 0; i < nn; i++) {
a[i] = (rand() % p) - p / 2; // between -p/2 and p/2
b[i] = (rand() % p) - p / 2;
a2[i] = 0;
}
cplx_fft_simple(nn / 2, a);
cplx_fft_simple(nn / 2, b);
cplx_fftvec_addmul_simple(nn / 2, a2, a, b);
cplx_ifft_simple(nn / 2, a2); // normalization is missing
double distance = 0;
// for (int32_t i = 0; i < 10; i++) {
// printf("%lf %lf\n", a2[i], a2[i] / (nn / 2.));
//}
for (uint32_t i = 0; i < nn; i++) {
double curdist = fabs(a2[i] / (nn / 2.) - rint(a2[i] / (nn / 2.)));
if (distance < curdist) distance = curdist;
dist_vector[i] = a2[i] / (nn / 2.) - rint(a2[i] / (nn / 2.));
}
printf("distance: %lf\n", distance);
ASSERT_LE(distance, 0.5); // switch from previous 0.1 to 0.5 per experiment 1 reqs
// double a3[] = {2,4,4,4,5,5,7,9}; //instead of dist_vector, for test
// nn = 8;
double mean = 0;
for (uint32_t i = 0; i < nn; i++) {
mean = mean + dist_vector[i];
}
mean = mean / nn;
double variance = 0;
for (uint32_t i = 0; i < nn; i++) {
variance = variance + pow((mean - dist_vector[i]), 2);
}
double stdev = sqrt(variance / nn);
printf("stdev: %lf\n", stdev);
spqlios_free(a);
spqlios_free(b);
spqlios_free(a2);
spqlios_free(dist_vector);
}
typedef double CPLX[2];
EXPORT uint32_t revbits(uint32_t i, uint32_t v);
void cplx_zero(CPLX r);
void cplx_addmul(CPLX r, const CPLX a, const CPLX b);
void halfcfft_eval(CPLX res, uint32_t nn, uint32_t k, const CPLX* coeffs, const CPLX* powomegas);
void halfcfft_naive(uint32_t nn, CPLX* data);
EXPORT void cplx_set(CPLX, const CPLX);
EXPORT void citwiddle(CPLX, CPLX, const CPLX);
EXPORT void invcitwiddle(CPLX, CPLX, const CPLX);
EXPORT void ctwiddle(CPLX, CPLX, const CPLX);
EXPORT void invctwiddle(CPLX, CPLX, const CPLX);
#include "../spqlios/cplx/cplx_fft_private.h"
#include "../spqlios/reim/reim_fft_internal.h"
#include "../spqlios/reim/reim_fft_private.h"
#include "../spqlios/reim4/reim4_fftvec_internal.h"
#include "../spqlios/reim4/reim4_fftvec_private.h"
TEST(fft, simple_fft_test) { // test for checking the simple_fft api
uint64_t nn = 8; // small fixed size for this simple-api check
// double* buf_fft = fft_precomp_get_buffer(tables, 0);
// double* buf_ifft = ifft_precomp_get_buffer(itables, 0);
// define the complex coefficients of two polynomials mod X^4-i
double a[4][2] = {{1.1, 2.2}, {3.3, 4.4}, {5.5, 6.6}, {7.7, 8.8}};
double b[4][2] = {{9., 10.}, {11., 12.}, {13., 14.}, {15., 16.}};
double c[4][2]; // for the result
double a2[4][2]; // for testing inverse fft
memcpy(a2, a, 8 * nn);
cplx_fft_simple(4, a);
cplx_fft_simple(4, b);
cplx_fftvec_mul_simple(4, c, a, b);
cplx_ifft_simple(4, c);
// c contains the complex coefficients 4.a*b mod X^4-i
cplx_ifft_simple(4, a);
double distance = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
double dist = fabs(a[i][0] / 4. - a2[i][0]);
if (distance < dist) distance = dist;
dist = fabs(a[i][1] / 4. - a2[i][1]);
if (distance < dist) distance = dist;
}
printf("distance: %lf\n", distance);
ASSERT_LE(distance, 0.1); // fft followed by ifft (up to the 1/4 scaling) should be nearly exact
for (uint32_t i = 0; i < nn / 4; i++) {
printf("%lf %lf\n", a2[i][0], a[i][0] / (nn / 2.));
printf("%lf %lf\n", a2[i][1], a[i][1] / (nn / 2.));
}
}
TEST(fft, reim_test) {
// double a[16] __attribute__ ((aligned(32)))= {1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8,9.9,10.,11.,12.,13.,14.,15.,16.};
// double b[16] __attribute__ ((aligned(32)))= {17.,18.,19.,20.,21.,22.,23.,24.,25.,26.,27.,28.,29.,30., 31.,32.};
// double c[16] __attribute__ ((aligned(32))); // for the result in reference layout
double a[16] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10., 11., 12., 13., 14., 15., 16.};
double b[16] = {17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.};
double c[16]; // for the result in reference layout
reim_fft_simple(8, a);
reim_fft_simple(8, b);
reim_fftvec_mul_simple(8, c, a, b);
reim_ifft_simple(8, c);
}
TEST(fft, reim_vs_regular_layout_mul_test) {
uint64_t nn = 16;
// define the complex coefficients of two polynomials mod X^8-i
double a1[8][2] __attribute__((aligned(32))) = {{1.1, 2.2}, {3.3, 4.4}, {5.5, 6.6}, {7.7, 8.8},
{9.9, 10.}, {11., 12.}, {13., 14.}, {15., 16.}};
double b1[8][2] __attribute__((aligned(32))) = {{17., 18.}, {19., 20.}, {21., 22.}, {23., 24.},
{25., 26.}, {27., 28.}, {29., 30.}, {31., 32.}};
double c1[8][2] __attribute__((aligned(32))); // for the result
double c2[16] __attribute__((aligned(32))); // for the result
double c3[8][2] __attribute__((aligned(32))); // for the result
double* a2 =
(double*)spqlios_alloc_custom_align(32, nn / 2 * 2 * sizeof(double)); // for storing the coefs in reim layout
double* b2 =
(double*)spqlios_alloc_custom_align(32, nn / 2 * 2 * sizeof(double)); // for storing the coefs in reim layout
// double* c2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX)); // for storing the coefs in reim
// layout
// organise the coefficients in the reim layout
for (uint32_t i = 0; i < nn / 2; i++) {
a2[i] = a1[i][0]; // a1 = a2, b1 = b2
a2[nn / 2 + i] = a1[i][1];
b2[i] = b1[i][0];
b2[nn / 2 + i] = b1[i][1];
}
// fft
cplx_fft_simple(8, a1);
reim_fft_simple(8, a2);
cplx_fft_simple(8, b1);
reim_fft_simple(8, b2);
cplx_fftvec_mul_simple(8, c1, a1, b1);
reim_fftvec_mul_simple(8, c2, a2, b2);
cplx_ifft_simple(8, c1);
reim_ifft_simple(8, c2);
// check base layout and reim layout result in the same values
double d = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
// printf("RE: cplx_result %lf and reim_result %lf \n", c1[i][0], c2[i]);
// printf("IM: cplx_result %lf and reim_result %lf \n", c1[i][1], c2[nn / 2 + i]);
double dre = fabs(c1[i][0] - c2[i]);
double dim = fabs(c1[i][1] - c2[nn / 2 + i]);
if (dre > d) d = dre;
if (dim > d) d = dim;
ASSERT_LE(d, 1e-7);
}
ASSERT_LE(d, 1e-7);
// check converting back to base layout:
for (uint32_t i = 0; i < nn / 2; i++) {
c3[i][0] = c2[i];
c3[i][1] = c2[8 + i];
}
d = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
double dre = fabs(c1[i][0] - c3[i][0]);
double dim = fabs(c1[i][1] - c3[i][1]);
if (dre > d) d = dre;
if (dim > d) d = dim;
ASSERT_LE(d, 1e-7);
}
ASSERT_LE(d, 1e-7);
spqlios_free(a2);
spqlios_free(b2);
// spqlios_free(c2);
}
TEST(fft, fftvec_convolution_recursiveoverk) {
static const uint64_t nn = 32768; // vary across (8192, 16384), 32768, 65536
double* a = (double*)spqlios_alloc_custom_align(32, nn * 8);
double* a2 = (double*)spqlios_alloc_custom_align(32, nn * 8);
double* b = (double*)spqlios_alloc_custom_align(32, nn * 8);
double* dist_vector = (double*)spqlios_alloc_custom_align(32, nn * 8);
printf("N size: %" PRId64 "\n", nn);
for (uint32_t k = 14; k <= 24; k++) { // vary k
printf("k size: %" PRId32 "\n", k);
int64_t p = UINT64_C(1) << k;
for (uint32_t i = 0; i < nn; i++) {
a[i] = (rand() % p) - p / 2;
b[i] = (rand() % p) - p / 2;
a2[i] = 0;
}
cplx_fft_simple(nn / 2, a);
cplx_fft_simple(nn / 2, b);
cplx_fftvec_addmul_simple(nn / 2, a2, a, b);
cplx_ifft_simple(nn / 2, a2);
double distance = 0;
for (uint32_t i = 0; i < nn; i++) {
double curdist = fabs(a2[i] / (nn / 2.) - rint(a2[i] / (nn / 2.)));
if (distance < curdist) distance = curdist;
dist_vector[i] = a2[i] / (nn / 2.) - rint(a2[i] / (nn / 2.));
}
printf("distance: %lf\n", distance);
ASSERT_LE(distance, 0.5); // switch from previous 0.1 to 0.5 per experiment 1 reqs
double mean = 0;
for (uint32_t i = 0; i < nn; i++) {
mean = mean + dist_vector[i];
}
mean = mean / nn;
double variance = 0;
for (uint32_t i = 0; i < nn; i++) {
variance = variance + pow((mean - dist_vector[i]), 2);
}
double stdev = sqrt(variance / nn);
printf("stdev: %lf\n", stdev);
}
spqlios_free(a);
spqlios_free(b);
spqlios_free(a2);
spqlios_free(dist_vector);
}
#ifdef __x86_64__
TEST(fft, cplx_fft_ref_vs_fft_reim_ref) {
for (uint64_t nn : {16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
REIM_FFT_PRECOMP* reimtables = new_reim_fft_precomp(m, 0);
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn / 2; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
}
memcpy(a1, a, nn / 2 * sizeof(CPLX));
for (uint32_t i = 0; i < nn / 2; i++) {
a2[i] = a[i][0];
a2[nn / 2 + i] = a[i][1];
}
cplx_fft_ref(tables, a1);
reim_fft_ref(reimtables, a2);
double d = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
double dre = fabs(a1[i][0] - a2[i]);
double dim = fabs(a1[i][1] - a2[nn / 2 + i]);
if (dre > d) d = dre;
if (dim > d) d = dim;
ASSERT_LE(d, 1e-7);
}
ASSERT_LE(d, 1e-7);
spqlios_free(a);
spqlios_free(a1);
spqlios_free(a2);
delete_cplx_fft_precomp(tables);
delete_reim_fft_precomp(reimtables);
}
}
#endif
TEST(fft, cplx_ifft_ref_vs_reim_ifft_ref) {
for (uint64_t nn : {16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
CPLX_IFFT_PRECOMP* tables = new_cplx_ifft_precomp(m, 0);
REIM_IFFT_PRECOMP* reimtables = new_reim_ifft_precomp(m, 0);
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn / 2; i++) {
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
a[i][1] = (rand() % p) - p / 2;
}
memcpy(a1, a, nn / 2 * sizeof(CPLX));
for (uint32_t i = 0; i < nn / 2; i++) {
a2[i] = a[i][0];
a2[nn / 2 + i] = a[i][1];
}
cplx_ifft_ref(tables, a1);
reim_ifft_ref(reimtables, a2);
double d = 0;
for (uint32_t i = 0; i < nn / 2; i++) {
double dre = fabs(a1[i][0] - a2[i]);
double dim = fabs(a1[i][1] - a2[nn / 2 + i]);
if (dre > d) d = dre;
if (dim > d) d = dim;
ASSERT_LE(d, 1e-7);
}
ASSERT_LE(d, 1e-7);
spqlios_free(a);
spqlios_free(a1);
spqlios_free(a2);
delete_cplx_fft_precomp(tables);
delete_reim_fft_precomp(reimtables);
}
}
#ifdef __x86_64__
TEST(fft, reim4_vecfft_addmul_fma_vs_ref) {
for (uint64_t nn : {16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
REIM4_FFTVEC_ADDMUL_PRECOMP* tbl = new_reim4_fftvec_addmul_precomp(m);
ASSERT_TRUE(tbl != nullptr);
double* a1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* b1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* b2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn; i++) {
a1[i] = (rand() % p) - p / 2; // between -p/2 and p/2
b1[i] = (rand() % p) - p / 2;
r1[i] = (rand() % p) - p / 2;
}
memcpy(a2, a1, nn / 2 * sizeof(CPLX));
memcpy(b2, b1, nn / 2 * sizeof(CPLX));
memcpy(r2, r1, nn / 2 * sizeof(CPLX));
reim4_fftvec_addmul_ref(tbl, r1, a1, b1);
reim4_fftvec_addmul_fma(tbl, r2, a2, b2);
double d = 0;
for (uint32_t i = 0; i < nn; i++) {
double di = fabs(r1[i] - r2[i]);
if (di > d) d = di;
ASSERT_LE(d, 1e-8);
}
ASSERT_LE(d, 1e-8);
spqlios_free(a1);
spqlios_free(a2);
spqlios_free(b1);
spqlios_free(b2);
spqlios_free(r1);
spqlios_free(r2);
delete_reim4_fftvec_addmul_precomp(tbl);
}
}
#endif
#ifdef __x86_64__
TEST(fft, reim4_vecfft_mul_fma_vs_ref) {
for (uint64_t nn : {16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
REIM4_FFTVEC_MUL_PRECOMP* tbl = new_reim4_fftvec_mul_precomp(m);
double* a1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* b1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* b2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn; i++) {
a1[i] = (rand() % p) - p / 2; // between -p/2 and p/2
b1[i] = (rand() % p) - p / 2;
r1[i] = (rand() % p) - p / 2;
}
memcpy(a2, a1, nn / 2 * sizeof(CPLX));
memcpy(b2, b1, nn / 2 * sizeof(CPLX));
memcpy(r2, r1, nn / 2 * sizeof(CPLX));
reim4_fftvec_mul_ref(tbl, r1, a1, b1);
reim4_fftvec_mul_fma(tbl, r2, a2, b2);
double d = 0;
for (uint32_t i = 0; i < nn; i++) {
double di = fabs(r1[i] - r2[i]);
if (di > d) d = di;
ASSERT_LE(d, 1e-8);
}
ASSERT_LE(d, 1e-8);
spqlios_free(a1);
spqlios_free(a2);
spqlios_free(b1);
spqlios_free(b2);
spqlios_free(r1);
spqlios_free(r2);
delete_reim_fftvec_mul_precomp(tbl);
}
}
#endif
#ifdef __x86_64__
TEST(fft, reim4_from_cplx_fma_vs_ref) {
for (uint64_t nn : {16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
REIM4_FROM_CPLX_PRECOMP* tbl = new_reim4_from_cplx_precomp(m);
double* a1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn; i++) {
a1[i] = (rand() % p) - p / 2; // between -p/2 and p/2
r1[i] = (rand() % p) - p / 2;
}
memcpy(a2, a1, nn / 2 * sizeof(CPLX));
memcpy(r2, r1, nn / 2 * sizeof(CPLX));
reim4_from_cplx_ref(tbl, r1, a1);
reim4_from_cplx_fma(tbl, r2, a2);
double d = 0;
for (uint32_t i = 0; i < nn; i++) {
double di = fabs(r1[i] - r2[i]);
if (di > d) d = di;
ASSERT_LE(d, 1e-8);
}
ASSERT_LE(d, 1e-8);
spqlios_free(a1);
spqlios_free(a2);
spqlios_free(r1);
spqlios_free(r2);
delete_reim4_from_cplx_precomp(tbl);
}
}
#endif
#ifdef __x86_64__
TEST(fft, reim4_to_cplx_fma_vs_ref) {
for (uint64_t nn : {16, 32, 64, 1024, 8192, 65536}) {
uint64_t m = nn / 2;
REIM4_TO_CPLX_PRECOMP* tbl = new_reim4_to_cplx_precomp(m);
double* a1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* a2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r1 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
double* r2 = (double*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
int64_t p = 1 << 16;
for (uint32_t i = 0; i < nn; i++) {
a1[i] = (rand() % p) - p / 2; // between -p/2 and p/2
r1[i] = (rand() % p) - p / 2;
}
memcpy(a2, a1, nn / 2 * sizeof(CPLX));
memcpy(r2, r1, nn / 2 * sizeof(CPLX));
reim4_to_cplx_ref(tbl, r1, a1);
reim4_to_cplx_fma(tbl, r2, a2);
double d = 0;
for (uint32_t i = 0; i < nn; i++) {
double di = fabs(r1[i] - r2[i]);
if (di > d) d = di;
ASSERT_LE(d, 1e-8);
}
ASSERT_LE(d, 1e-8);
spqlios_free(a1);
spqlios_free(a2);
spqlios_free(r1);
spqlios_free(r2);
delete_reim4_from_cplx_precomp(tbl);
}
}
#endif

View File

@@ -0,0 +1,42 @@
#include "gtest/gtest.h"
#include "spqlios/arithmetic/vec_rnx_arithmetic_private.h"
#include "testlib/vec_rnx_layout.h"
static void test_rnx_approxdecomp(RNX_APPROXDECOMP_FROM_TNXDBL_F approxdec) {
for (const uint64_t nn : {2, 4, 8, 32}) {
MOD_RNX* module = new_rnx_module_info(nn, FFT64);
for (const uint64_t ell : {1, 2, 7}) {
for (const uint64_t k : {2, 5}) {
TNXDBL_APPROXDECOMP_GADGET* gadget = new_tnxdbl_approxdecomp_gadget(module, k, ell);
for (const uint64_t res_size : {ell, ell - 1, ell + 1}) {
const uint64_t res_sl = nn + uniform_u64_bits(2);
rnx_vec_f64_layout in(nn, 1, nn);
in.fill_random(3);
rnx_vec_f64_layout out(nn, res_size, res_sl);
approxdec(module, gadget, out.data(), res_size, res_sl, in.data());
// reconstruct the output
uint64_t msize = std::min(res_size, ell);
double err_bnd = msize == ell ? pow(2., -double(msize * k) - 1) : pow(2., -double(msize * k));
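// rationale for err_bnd (informal, assuming a base-2^k gadget with ell limbs):
// reconstructing from all ell limbs leaves only the truncation of the last limb,
// i.e. an error of at most 2^-(ell*k + 1); if fewer limbs are produced or kept,
// the dropped limbs dominate and the bound relaxes to 2^-(msize*k).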
for (uint64_t j = 0; j < nn; ++j) {
double in_j = in.data()[j];
double out_j = 0;
for (uint64_t i = 0; i < res_size; ++i) {
out_j += out.get_copy(i).get_coeff(j) * pow(2., -double((i + 1) * k));
}
double err = out_j - in_j;
double err_abs = fabs(err - rint(err));
ASSERT_LE(err_abs, err_bnd);
}
}
delete_tnxdbl_approxdecomp_gadget(gadget);
}
}
delete_rnx_module_info(module);
}
}
TEST(vec_rnx, rnx_approxdecomp) { test_rnx_approxdecomp(rnx_approxdecomp_from_tnxdbl); }
TEST(vec_rnx, rnx_approxdecomp_ref) { test_rnx_approxdecomp(rnx_approxdecomp_from_tnxdbl_ref); }
#ifdef __x86_64__
TEST(vec_rnx, rnx_approxdecomp_avx) { test_rnx_approxdecomp(rnx_approxdecomp_from_tnxdbl_avx); }
#endif

View File

@@ -0,0 +1,134 @@
#include <gtest/gtest.h>
#include <spqlios/arithmetic/vec_rnx_arithmetic_private.h>
#include "testlib/test_commons.h"
template <typename SRC_T, typename DST_T>
static void test_conv(void (*conv_f)(const MOD_RNX*, //
DST_T* res, uint64_t res_size, uint64_t res_sl, //
const SRC_T* a, uint64_t a_size, uint64_t a_sl), //
DST_T (*ideal_conv_f)(SRC_T x), //
SRC_T (*random_f)() //
) {
for (uint64_t nn : {2, 4, 16, 64}) {
MOD_RNX* module = new_rnx_module_info(nn, FFT64);
for (uint64_t a_size : {0, 1, 2, 5}) {
for (uint64_t res_size : {0, 1, 3, 5}) {
for (uint64_t trials = 0; trials < 20; ++trials) {
uint64_t a_sl = nn + uniform_u64_bits(2);
uint64_t res_sl = nn + uniform_u64_bits(2);
std::vector<SRC_T> a(a_sl * a_size);
std::vector<DST_T> res(res_sl * res_size);
uint64_t msize = std::min(a_size, res_size);
for (uint64_t i = 0; i < a_size; ++i) {
for (uint64_t j = 0; j < nn; ++j) {
a[i * a_sl + j] = random_f();
}
}
conv_f(module, res.data(), res_size, res_sl, a.data(), a_size, a_sl);
for (uint64_t i = 0; i < msize; ++i) {
for (uint64_t j = 0; j < nn; ++j) {
SRC_T aij = a[i * a_sl + j];
DST_T expect = ideal_conv_f(aij);
DST_T actual = res[i * res_sl + j];
ASSERT_EQ(expect, actual);
}
}
for (uint64_t i = msize; i < res_size; ++i) {
DST_T expect = 0;
for (uint64_t j = 0; j < nn; ++j) {
SRC_T actual = res[i * res_sl + j];
ASSERT_EQ(expect, actual);
}
}
}
}
}
delete_rnx_module_info(module);
}
}
static int32_t ideal_dbl_to_tn32(double a) {
double _2p32 = INT64_C(1) << 32;
double a_mod_1 = a - rint(a);
int64_t t = rint(a_mod_1 * _2p32);
return int32_t(t);
}
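// worked example (round-to-nearest assumed): ideal_dbl_to_tn32(0.25) == 1 << 30,
// and ideal_dbl_to_tn32(1.25) gives the same value since only the fractional part matters.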
static double random_f64_10() { return uniform_f64_bounds(-10, 10); }
static void test_vec_rnx_to_tnx32(VEC_RNX_TO_TNX32_F vec_rnx_to_tnx32_f) {
test_conv(vec_rnx_to_tnx32_f, ideal_dbl_to_tn32, random_f64_10);
}
TEST(vec_rnx_arithmetic, vec_rnx_to_tnx32) { test_vec_rnx_to_tnx32(vec_rnx_to_tnx32); }
TEST(vec_rnx_arithmetic, vec_rnx_to_tnx32_ref) { test_vec_rnx_to_tnx32(vec_rnx_to_tnx32_ref); }
static double ideal_tn32_to_dbl(int32_t a) {
const double _2p32 = INT64_C(1) << 32;
return double(a) / _2p32;
}
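// e.g. ideal_tn32_to_dbl(INT32_MIN) == -0.5 and ideal_tn32_to_dbl(1 << 30) == 0.25.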
static int32_t random_t32() { return uniform_i64_bits(32); }
static void test_vec_rnx_from_tnx32(VEC_RNX_FROM_TNX32_F vec_rnx_from_tnx32_f) {
test_conv(vec_rnx_from_tnx32_f, ideal_tn32_to_dbl, random_t32);
}
TEST(vec_rnx_arithmetic, vec_rnx_from_tnx32) { test_vec_rnx_from_tnx32(vec_rnx_from_tnx32); }
TEST(vec_rnx_arithmetic, vec_rnx_from_tnx32_ref) { test_vec_rnx_from_tnx32(vec_rnx_from_tnx32_ref); }
static int32_t ideal_dbl_round_to_i32(double a) { return int32_t(rint(a)); }
static double random_dbl_explaw_18() { return uniform_f64_bounds(-1., 1.) * pow(2., uniform_u64_bits(6) % 19); }
static void test_vec_rnx_to_znx32(VEC_RNX_TO_ZNX32_F vec_rnx_to_znx32_f) {
test_conv(vec_rnx_to_znx32_f, ideal_dbl_round_to_i32, random_dbl_explaw_18);
}
TEST(zn_arithmetic, vec_rnx_to_znx32) { test_vec_rnx_to_znx32(vec_rnx_to_znx32); }
TEST(zn_arithmetic, vec_rnx_to_znx32_ref) { test_vec_rnx_to_znx32(vec_rnx_to_znx32_ref); }
static double ideal_i32_to_dbl(int32_t a) { return double(a); }
static int32_t random_i32_explaw_18() { return uniform_i64_bits(uniform_u64_bits(6) % 19); }
static void test_vec_rnx_from_znx32(VEC_RNX_FROM_ZNX32_F vec_rnx_from_znx32_f) {
test_conv(vec_rnx_from_znx32_f, ideal_i32_to_dbl, random_i32_explaw_18);
}
TEST(zn_arithmetic, vec_rnx_from_znx32) { test_vec_rnx_from_znx32(vec_rnx_from_znx32); }
TEST(zn_arithmetic, vec_rnx_from_znx32_ref) { test_vec_rnx_from_znx32(vec_rnx_from_znx32_ref); }
static double ideal_dbl_to_tndbl(double a) { return a - rint(a); }
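// e.g. ideal_dbl_to_tndbl(1.75) == -0.25: the representative of 1.75 mod 1 closest to 0.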
static void test_vec_rnx_to_tnxdbl(VEC_RNX_TO_TNXDBL_F vec_rnx_to_tnxdbl_f) {
test_conv(vec_rnx_to_tnxdbl_f, ideal_dbl_to_tndbl, random_f64_10);
}
TEST(zn_arithmetic, vec_rnx_to_tnxdbl) { test_vec_rnx_to_tnxdbl(vec_rnx_to_tnxdbl); }
TEST(zn_arithmetic, vec_rnx_to_tnxdbl_ref) { test_vec_rnx_to_tnxdbl(vec_rnx_to_tnxdbl_ref); }
#if 0
static int64_t ideal_dbl_round_to_i64(double a) { return rint(a); }
static double random_dbl_explaw_50() { return uniform_f64_bounds(-1., 1.) * pow(2., uniform_u64_bits(7) % 51); }
static void test_dbl_round_to_i64(DBL_ROUND_TO_I64_F dbl_round_to_i64_f) {
test_conv(dbl_round_to_i64_f, ideal_dbl_round_to_i64, random_dbl_explaw_50);
}
TEST(zn_arithmetic, dbl_round_to_i64) { test_dbl_round_to_i64(dbl_round_to_i64); }
TEST(zn_arithmetic, dbl_round_to_i64_ref) { test_dbl_round_to_i64(dbl_round_to_i64_ref); }
static double ideal_i64_to_dbl(int64_t a) { return double(a); }
static int64_t random_i64_explaw_50() { return uniform_i64_bits(uniform_u64_bits(7) % 51); }
static void test_i64_to_dbl(I64_TO_DBL_F i64_to_dbl_f) {
test_conv(i64_to_dbl_f, ideal_i64_to_dbl, random_i64_explaw_50);
}
TEST(zn_arithmetic, i64_to_dbl) { test_i64_to_dbl(i64_to_dbl); }
TEST(zn_arithmetic, i64_to_dbl_ref) { test_i64_to_dbl(i64_to_dbl_ref); }
#endif

View File

@@ -0,0 +1,73 @@
#include <gtest/gtest.h>
#include "spqlios/arithmetic/vec_rnx_arithmetic_private.h"
#include "spqlios/reim/reim_fft.h"
#include "test/testlib/vec_rnx_layout.h"
static void test_vec_rnx_svp_prepare(RNX_SVP_PREPARE_F* rnx_svp_prepare, BYTES_OF_RNX_SVP_PPOL_F* tmp_bytes) {
for (uint64_t n : {2, 4, 8, 64}) {
MOD_RNX* mod = new_rnx_module_info(n, FFT64);
const double invm = 1. / mod->m;
rnx_f64 in = rnx_f64::random_log2bound(n, 40);
rnx_f64 in_divide_by_m = rnx_f64::zero(n);
for (uint64_t i = 0; i < n; ++i) {
in_divide_by_m.set_coeff(i, in.get_coeff(i) * invm);
}
fft64_rnx_svp_ppol_layout out(n);
reim_fft64vec expect = simple_fft64(in_divide_by_m);
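// (assumption behind the reference value: the prepared svp ppol is expected to hold
// FFT(in) scaled by 1/m, i.e. the backward-FFT normalization is folded into the
// prepared operand at preparation time)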
rnx_svp_prepare(mod, out.data, in.data());
const double* ed = (double*)expect.data();
const double* ac = (double*)out.data;
for (uint64_t i = 0; i < n; ++i) {
ASSERT_LE(fabs(ed[i] - ac[i]), 1e-10) << i << " " << n;
}
delete_rnx_module_info(mod);
}
}
TEST(vec_rnx, vec_rnx_svp_prepare) { test_vec_rnx_svp_prepare(rnx_svp_prepare, bytes_of_rnx_svp_ppol); }
TEST(vec_rnx, vec_rnx_svp_prepare_ref) {
test_vec_rnx_svp_prepare(fft64_rnx_svp_prepare_ref, fft64_bytes_of_rnx_svp_ppol);
}
static void test_vec_rnx_svp_apply(RNX_SVP_APPLY_F* apply) {
for (uint64_t n : {2, 4, 8, 64, 128}) {
MOD_RNX* mod = new_rnx_module_info(n, FFT64);
// poly 1 to multiply - create and prepare
fft64_rnx_svp_ppol_layout ppol(n);
ppol.fill_random(1.);
for (uint64_t sa : {3, 5, 8}) {
for (uint64_t sr : {3, 5, 8}) {
uint64_t a_sl = n + uniform_u64_bits(2);
uint64_t r_sl = n + uniform_u64_bits(2);
// poly 2 to multiply
rnx_vec_f64_layout a(n, sa, a_sl);
a.fill_random(19);
// original operation result
rnx_vec_f64_layout res(n, sr, r_sl);
thash hash_a_before = a.content_hash();
thash hash_ppol_before = ppol.content_hash();
apply(mod, res.data(), sr, r_sl, ppol.data, a.data(), sa, a_sl);
ASSERT_EQ(a.content_hash(), hash_a_before);
ASSERT_EQ(ppol.content_hash(), hash_ppol_before);
// create expected value
reim_fft64vec ppo = ppol.get_copy();
std::vector<rnx_f64> expect(sr);
for (uint64_t i = 0; i < sr; ++i) {
expect[i] = simple_ifft64(ppo * simple_fft64(a.get_copy_zext(i)));
}
// largest error we can safely expect from the double-precision FFT pipeline
double prec_expect = n * pow(2., 19 - 50);
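// rough fp64 heuristic (not a proven bound): inputs are bounded by 2^19, a double
// carries ~52 bits of mantissa, and the FFT/IFFT accumulates O(n) terms, hence n * 2^(19-50).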
for (uint64_t i = 0; i < sr; ++i) {
rnx_f64 actual = res.get_copy_zext(i);
ASSERT_LE(infty_dist(actual, expect[i]), prec_expect);
}
}
}
delete_rnx_module_info(mod);
}
}
TEST(vec_rnx, vec_rnx_svp_apply) { test_vec_rnx_svp_apply(rnx_svp_apply); }
TEST(vec_rnx, vec_rnx_svp_apply_ref) { test_vec_rnx_svp_apply(fft64_rnx_svp_apply_ref); }

View File

@@ -0,0 +1,417 @@
#include <gtest/gtest.h>
#include "spqlios/arithmetic/vec_rnx_arithmetic_private.h"
#include "spqlios/reim/reim_fft.h"
#include "testlib/vec_rnx_layout.h"
// this test is disabled by default, since it deliberately performs invalid accesses
#if 0
TEST(rnx_layout, valgrind_antipattern_test) {
uint64_t n = 4;
rnx_vec_f64_layout v(n, 7, 13);
// this should be ok
v.set(0, rnx_f64::zero(n));
// this should abort (wrong ring dimension)
ASSERT_DEATH(v.set(3, rnx_f64::zero(2 * n)), "");
// this should abort (out of bounds)
ASSERT_DEATH(v.set(8, rnx_f64::zero(n)), "");
// this should be ok
ASSERT_EQ(v.get_copy_zext(0), rnx_f64::zero(n));
// should be an uninit read
ASSERT_TRUE(!(v.get_copy_zext(2) == rnx_f64::zero(n))); // should be uninit
// should be an invalid read (inter-slice)
ASSERT_NE(v.data()[4], 0);
ASSERT_EQ(v.data()[2], 0); // should be ok
// should be an uninit read
ASSERT_NE(v.data()[13], 0); // should be uninit
}
#endif
// test of binary operations
// test for out of place calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_binop_outplace(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
for (uint64_t n : {2, 4, 8, 128}) {
RNX_MODULE_TYPE mtype = FFT64;
MOD_RNX* mod = new_rnx_module_info(n, mtype);
for (uint64_t sa : {7, 13, 15}) {
for (uint64_t sb : {7, 13, 15}) {
for (uint64_t sc : {7, 13, 15}) {
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
uint64_t b_sl = uniform_u64_bits(3) * 5 + n;
uint64_t c_sl = uniform_u64_bits(3) * 5 + n;
rnx_vec_f64_layout la(n, sa, a_sl);
rnx_vec_f64_layout lb(n, sb, b_sl);
rnx_vec_f64_layout lc(n, sc, c_sl);
std::vector<rnx_f64> expect(sc);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, rnx_f64::random_log2bound(n, 1.));
}
for (uint64_t i = 0; i < sb; ++i) {
lb.set(i, rnx_f64::random_log2bound(n, 1.));
}
for (uint64_t i = 0; i < sc; ++i) {
expect[i] = ref_binop(la.get_copy_zext(i), lb.get_copy_zext(i));
}
binop(mod, // N
lc.data(), sc, c_sl, // res
la.data(), sa, a_sl, // a
lb.data(), sb, b_sl);
for (uint64_t i = 0; i < sc; ++i) {
ASSERT_EQ(lc.get_copy_zext(i), expect[i]);
}
}
}
}
delete_rnx_module_info(mod);
}
}
// test for inplace1 calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_binop_inplace1(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
for (uint64_t n : {2, 4, 64}) {
RNX_MODULE_TYPE mtype = FFT64;
MOD_RNX* mod = new_rnx_module_info(n, mtype);
for (uint64_t sa : {3, 9, 12}) {
for (uint64_t sb : {3, 9, 12}) {
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
uint64_t b_sl = uniform_u64_bits(3) * 5 + n;
rnx_vec_f64_layout la(n, sa, a_sl);
rnx_vec_f64_layout lb(n, sb, b_sl);
std::vector<rnx_f64> expect(sa);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, rnx_f64::random_log2bound(n, 1.));
}
for (uint64_t i = 0; i < sb; ++i) {
lb.set(i, rnx_f64::random_log2bound(n, 1.));
}
for (uint64_t i = 0; i < sa; ++i) {
expect[i] = ref_binop(la.get_copy_zext(i), lb.get_copy_zext(i));
}
binop(mod, // N
la.data(), sa, a_sl, // res
la.data(), sa, a_sl, // a
lb.data(), sb, b_sl);
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), expect[i]);
}
}
}
delete_rnx_module_info(mod);
}
}
// test for inplace2 calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_binop_inplace2(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
for (uint64_t n : {4, 32, 64}) {
RNX_MODULE_TYPE mtype = FFT64;
MOD_RNX* mod = new_rnx_module_info(n, mtype);
for (uint64_t sa : {3, 9, 12}) {
for (uint64_t sb : {3, 9, 12}) {
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
uint64_t b_sl = uniform_u64_bits(3) * 5 + n;
rnx_vec_f64_layout la(n, sa, a_sl);
rnx_vec_f64_layout lb(n, sb, b_sl);
std::vector<rnx_f64> expect(sb);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, rnx_f64::random_log2bound(n, 1.));
}
for (uint64_t i = 0; i < sb; ++i) {
lb.set(i, rnx_f64::random_log2bound(n, 1.));
}
for (uint64_t i = 0; i < sb; ++i) {
expect[i] = ref_binop(la.get_copy_zext(i), lb.get_copy_zext(i));
}
binop(mod, // N
lb.data(), sb, b_sl, // res
la.data(), sa, a_sl, // a
lb.data(), sb, b_sl);
for (uint64_t i = 0; i < sb; ++i) {
ASSERT_EQ(lb.get_copy_zext(i), expect[i]);
}
}
}
delete_rnx_module_info(mod);
}
}
// test for inplace3 calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_binop_inplace3(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
for (uint64_t n : {2, 16, 1024}) {
RNX_MODULE_TYPE mtype = FFT64;
MOD_RNX* mod = new_rnx_module_info(n, mtype);
for (uint64_t sa : {2, 6, 11}) {
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
rnx_vec_f64_layout la(n, sa, a_sl);
std::vector<rnx_f64> expect(sa);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, rnx_f64::random_log2bound(n, 1.));
}
for (uint64_t i = 0; i < sa; ++i) {
expect[i] = ref_binop(la.get_copy_zext(i), la.get_copy_zext(i));
}
binop(mod, // N
la.data(), sa, a_sl, // res
la.data(), sa, a_sl, // a
la.data(), sa, a_sl);
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), expect[i]);
}
}
delete_rnx_module_info(mod);
}
}
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_binop(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
test_vec_rnx_elemw_binop_outplace(binop, ref_binop);
test_vec_rnx_elemw_binop_inplace1(binop, ref_binop);
test_vec_rnx_elemw_binop_inplace2(binop, ref_binop);
test_vec_rnx_elemw_binop_inplace3(binop, ref_binop);
}
static rnx_f64 poly_add(const rnx_f64& a, const rnx_f64& b) { return a + b; }
static rnx_f64 poly_sub(const rnx_f64& a, const rnx_f64& b) { return a - b; }
TEST(vec_rnx, vec_rnx_add) { test_vec_rnx_elemw_binop(vec_rnx_add, poly_add); }
TEST(vec_rnx, vec_rnx_add_ref) { test_vec_rnx_elemw_binop(vec_rnx_add_ref, poly_add); }
#ifdef __x86_64__
TEST(vec_rnx, vec_rnx_add_avx) { test_vec_rnx_elemw_binop(vec_rnx_add_avx, poly_add); }
#endif
TEST(vec_rnx, vec_rnx_sub) { test_vec_rnx_elemw_binop(vec_rnx_sub, poly_sub); }
TEST(vec_rnx, vec_rnx_sub_ref) { test_vec_rnx_elemw_binop(vec_rnx_sub_ref, poly_sub); }
#ifdef __x86_64__
TEST(vec_rnx, vec_rnx_sub_avx) { test_vec_rnx_elemw_binop(vec_rnx_sub_avx, poly_sub); }
#endif
// test for out of place calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_unop_param_outplace(ACTUAL_FCN test_mul_xp_minus_one, EXPECT_FCN ref_mul_xp_minus_one,
int64_t (*param_gen)()) {
for (uint64_t n : {2, 4, 8, 128}) {
MOD_RNX* mod = new_rnx_module_info(n, FFT64);
for (uint64_t sa : {7, 13, 15}) {
for (uint64_t sb : {7, 13, 15}) {
{
int64_t p = param_gen();
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
uint64_t b_sl = uniform_u64_bits(3) * 4 + n;
rnx_vec_f64_layout la(n, sa, a_sl);
rnx_vec_f64_layout lb(n, sb, b_sl);
std::vector<rnx_f64> expect(sb);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, rnx_f64::random_log2bound(n, 0));
}
for (uint64_t i = 0; i < sb; ++i) {
expect[i] = ref_mul_xp_minus_one(p, la.get_copy_zext(i));
}
test_mul_xp_minus_one(mod, //
p, //
lb.data(), sb, b_sl, //
la.data(), sa, a_sl //
);
for (uint64_t i = 0; i < sb; ++i) {
ASSERT_EQ(lb.get_copy_zext(i), expect[i]) << n << " " << sa << " " << sb << " " << i;
}
}
}
}
delete_rnx_module_info(mod);
}
}
// test for inplace calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_unop_param_inplace(ACTUAL_FCN actual_function, EXPECT_FCN ref_function,
int64_t (*param_gen)()) {
for (uint64_t n : {2, 16, 1024}) {
MOD_RNX* mod = new_rnx_module_info(n, FFT64);
for (uint64_t sa : {2, 6, 11}) {
{
int64_t p = param_gen();
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
rnx_vec_f64_layout la(n, sa, a_sl);
std::vector<rnx_f64> expect(sa);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, rnx_f64::random_log2bound(n, 0));
}
for (uint64_t i = 0; i < sa; ++i) {
expect[i] = ref_function(p, la.get_copy_zext(i));
}
actual_function(mod, // N
p, //
la.data(), sa, a_sl, // res
la.data(), sa, a_sl // a
);
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), expect[i]) << n << " " << sa << " " << i;
}
}
}
delete_rnx_module_info(mod);
}
}
static int64_t random_mul_xp_minus_one_param() { return uniform_i64(); }
static int64_t random_automorphism_param() { return 2 * uniform_i64() + 1; }
static int64_t random_rotation_param() { return uniform_i64(); }
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_mul_xp_minus_one(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
test_vec_rnx_elemw_unop_param_outplace(binop, ref_binop, random_mul_xp_minus_one_param);
test_vec_rnx_elemw_unop_param_inplace(binop, ref_binop, random_mul_xp_minus_one_param);
}
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_rotate(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
test_vec_rnx_elemw_unop_param_outplace(binop, ref_binop, random_rotation_param);
test_vec_rnx_elemw_unop_param_inplace(binop, ref_binop, random_rotation_param);
}
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_automorphism(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
test_vec_rnx_elemw_unop_param_outplace(binop, ref_binop, random_automorphism_param);
test_vec_rnx_elemw_unop_param_inplace(binop, ref_binop, random_automorphism_param);
}
static rnx_f64 poly_mul_xp_minus_one(const int64_t p, const rnx_f64& a) {
uint64_t n = a.nn();
rnx_f64 res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, a.get_coeff(i - p) - a.get_coeff(i));
}
return res;
}
static rnx_f64 poly_rotate(const int64_t p, const rnx_f64& a) {
uint64_t n = a.nn();
rnx_f64 res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, a.get_coeff(i - p));
}
return res;
}
static rnx_f64 poly_automorphism(const int64_t p, const rnx_f64& a) {
uint64_t n = a.nn();
rnx_f64 res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i * p, a.get_coeff(i));
}
return res;
}
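// (the reference helpers above assume that set_coeff/get_coeff index negacyclically,
// i.e. X^n = -1: for n = 4 and p = 1, poly_rotate maps (a0,a1,a2,a3) to (-a3,a0,a1,a2))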
TEST(vec_rnx, vec_rnx_mul_xp_minus_one) {
test_vec_rnx_elemw_mul_xp_minus_one(vec_rnx_mul_xp_minus_one, poly_mul_xp_minus_one);
}
TEST(vec_rnx, vec_rnx_mul_xp_minus_one_ref) {
test_vec_rnx_elemw_mul_xp_minus_one(vec_rnx_mul_xp_minus_one_ref, poly_mul_xp_minus_one);
}
TEST(vec_rnx, vec_rnx_rotate) { test_vec_rnx_elemw_rotate(vec_rnx_rotate, poly_rotate); }
TEST(vec_rnx, vec_rnx_rotate_ref) { test_vec_rnx_elemw_rotate(vec_rnx_rotate_ref, poly_rotate); }
TEST(vec_rnx, vec_rnx_automorphism) { test_vec_rnx_elemw_automorphism(vec_rnx_automorphism, poly_automorphism); }
TEST(vec_rnx, vec_rnx_automorphism_ref) {
test_vec_rnx_elemw_automorphism(vec_rnx_automorphism_ref, poly_automorphism);
}
// test for out of place calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_unop_outplace(ACTUAL_FCN actual_function, EXPECT_FCN ref_function) {
for (uint64_t n : {2, 4, 8, 128}) {
MOD_RNX* mod = new_rnx_module_info(n, FFT64);
for (uint64_t sa : {7, 13, 15}) {
for (uint64_t sb : {7, 13, 15}) {
{
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
uint64_t b_sl = uniform_u64_bits(3) * 4 + n;
rnx_vec_f64_layout la(n, sa, a_sl);
rnx_vec_f64_layout lb(n, sb, b_sl);
std::vector<rnx_f64> expect(sb);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, rnx_f64::random_log2bound(n, 0));
}
for (uint64_t i = 0; i < sb; ++i) {
expect[i] = ref_function(la.get_copy_zext(i));
}
actual_function(mod, //
lb.data(), sb, b_sl, //
la.data(), sa, a_sl //
);
for (uint64_t i = 0; i < sb; ++i) {
ASSERT_EQ(lb.get_copy_zext(i), expect[i]) << n << " " << sa << " " << sb << " " << i;
}
}
}
}
delete_rnx_module_info(mod);
}
}
// test for inplace calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_unop_inplace(ACTUAL_FCN actual_function, EXPECT_FCN ref_function) {
for (uint64_t n : {2, 16, 1024}) {
MOD_RNX* mod = new_rnx_module_info(n, FFT64);
for (uint64_t sa : {2, 6, 11}) {
{
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
rnx_vec_f64_layout la(n, sa, a_sl);
std::vector<rnx_f64> expect(sa);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, rnx_f64::random_log2bound(n, 0));
}
for (uint64_t i = 0; i < sa; ++i) {
expect[i] = ref_function(la.get_copy_zext(i));
}
actual_function(mod, // N
la.data(), sa, a_sl, // res
la.data(), sa, a_sl // a
);
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), expect[i]) << n << " " << sa << " " << i;
}
}
}
delete_rnx_module_info(mod);
}
}
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_rnx_elemw_unop(ACTUAL_FCN unnop, EXPECT_FCN ref_unnop) {
test_vec_rnx_elemw_unop_outplace(unnop, ref_unnop);
test_vec_rnx_elemw_unop_inplace(unnop, ref_unnop);
}
static rnx_f64 poly_copy(const rnx_f64& a) { return a; }
static rnx_f64 poly_negate(const rnx_f64& a) { return -a; }
TEST(vec_rnx, vec_rnx_copy) { test_vec_rnx_elemw_unop(vec_rnx_copy, poly_copy); }
TEST(vec_rnx, vec_rnx_copy_ref) { test_vec_rnx_elemw_unop(vec_rnx_copy_ref, poly_copy); }
TEST(vec_rnx, vec_rnx_negate) { test_vec_rnx_elemw_unop(vec_rnx_negate, poly_negate); }
TEST(vec_rnx, vec_rnx_negate_ref) { test_vec_rnx_elemw_unop(vec_rnx_negate_ref, poly_negate); }
#ifdef __x86_64__
TEST(vec_rnx, vec_rnx_negate_avx) { test_vec_rnx_elemw_unop(vec_rnx_negate_avx, poly_negate); }
#endif
// test for inplace calls
void test_vec_rnx_zero(VEC_RNX_ZERO_F actual_function) {
for (uint64_t n : {2, 16, 1024}) {
MOD_RNX* mod = new_rnx_module_info(n, FFT64);
for (uint64_t sa : {2, 6, 11}) {
{
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
rnx_vec_f64_layout la(n, sa, a_sl);
const rnx_f64 ZERO = rnx_f64::zero(n);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, rnx_f64::random_log2bound(n, 0));
}
actual_function(mod, // N
la.data(), sa, a_sl // res
);
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), ZERO) << n << " " << sa << " " << i;
}
}
}
delete_rnx_module_info(mod);
}
}
TEST(vec_rnx, vec_rnx_zero) { test_vec_rnx_zero(vec_rnx_zero); }
TEST(vec_rnx, vec_rnx_zero_ref) { test_vec_rnx_zero(vec_rnx_zero_ref); }

View File

@@ -0,0 +1,291 @@
#include "gtest/gtest.h"
#include "../spqlios/arithmetic/vec_rnx_arithmetic_private.h"
#include "../spqlios/reim/reim_fft.h"
#include "testlib/vec_rnx_layout.h"
static void test_vmp_apply_dft_to_dft_outplace( //
RNX_VMP_APPLY_DFT_TO_DFT_F* apply, //
RNX_VMP_APPLY_DFT_TO_DFT_TMP_BYTES_F* tmp_bytes) {
for (uint64_t nn : {2, 4, 8, 64}) {
MOD_RNX* module = new_rnx_module_info(nn, FFT64);
for (uint64_t mat_nrows : {1, 4, 7}) {
for (uint64_t mat_ncols : {1, 2, 5}) {
for (uint64_t in_size : {1, 4, 7}) {
for (uint64_t out_size : {1, 2, 5}) {
const uint64_t in_sl = nn + uniform_u64_bits(2);
const uint64_t out_sl = nn + uniform_u64_bits(2);
rnx_vec_f64_layout in(nn, in_size, in_sl);
fft64_rnx_vmp_pmat_layout pmat(nn, mat_nrows, mat_ncols);
rnx_vec_f64_layout out(nn, out_size, out_sl);
in.fill_random(0);
pmat.fill_random(0);
// naive computation of the product
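// (semantics being checked, in the DFT domain: out[col] = sum over row < min(mat_nrows, in_size)
// of pmat[row, col] * in[row], with rows/columns beyond the matrix bounds read as zero by get_zext)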
std::vector<reim_fft64vec> expect(out_size, reim_fft64vec(nn));
for (uint64_t col = 0; col < out_size; ++col) {
reim_fft64vec ex = reim_fft64vec::zero(nn);
for (uint64_t row = 0; row < std::min(mat_nrows, in_size); ++row) {
ex += pmat.get_zext(row, col) * in.get_dft_copy(row);
}
expect[col] = ex;
}
// apply the product
std::vector<uint8_t> tmp(tmp_bytes(module, out_size, in_size, mat_nrows, mat_ncols));
apply(module, //
out.data(), out_size, out_sl, //
in.data(), in_size, in_sl, //
pmat.data, mat_nrows, mat_ncols, //
tmp.data());
// check that the output is close to the expected value
for (uint64_t col = 0; col < out_size; ++col) {
reim_fft64vec actual = out.get_dft_copy_zext(col);
ASSERT_LE(infty_dist(actual, expect[col]), 1e-10);
}
}
}
}
}
delete_rnx_module_info(module);
}
}
static void test_vmp_apply_dft_to_dft_inplace( //
RNX_VMP_APPLY_DFT_TO_DFT_F* apply, //
RNX_VMP_APPLY_DFT_TO_DFT_TMP_BYTES_F* tmp_bytes) {
for (uint64_t nn : {2, 4, 8, 64}) {
MOD_RNX* module = new_rnx_module_info(nn, FFT64);
for (uint64_t mat_nrows : {1, 2, 6}) {
for (uint64_t mat_ncols : {1, 2, 7, 8}) {
for (uint64_t in_size : {1, 3, 6}) {
for (uint64_t out_size : {1, 3, 6}) {
const uint64_t in_out_sl = nn + uniform_u64_bits(2);
rnx_vec_f64_layout in_out(nn, std::max(in_size, out_size), in_out_sl);
fft64_rnx_vmp_pmat_layout pmat(nn, mat_nrows, mat_ncols);
in_out.fill_random(0);
pmat.fill_random(0);
// naive computation of the product
std::vector<reim_fft64vec> expect(out_size, reim_fft64vec(nn));
for (uint64_t col = 0; col < out_size; ++col) {
reim_fft64vec ex = reim_fft64vec::zero(nn);
for (uint64_t row = 0; row < std::min(mat_nrows, in_size); ++row) {
ex += pmat.get_zext(row, col) * in_out.get_dft_copy(row);
}
expect[col] = ex;
}
// apply the product
std::vector<uint8_t> tmp(tmp_bytes(module, out_size, in_size, mat_nrows, mat_ncols));
apply(module, //
in_out.data(), out_size, in_out_sl, //
in_out.data(), in_size, in_out_sl, //
pmat.data, mat_nrows, mat_ncols, //
tmp.data());
// check that the output is close to the expected value
for (uint64_t col = 0; col < out_size; ++col) {
reim_fft64vec actual = in_out.get_dft_copy_zext(col);
ASSERT_LE(infty_dist(actual, expect[col]), 1e-10);
}
}
}
}
}
delete_rnx_module_info(module);
}
}
static void test_vmp_apply_dft_to_dft( //
RNX_VMP_APPLY_DFT_TO_DFT_F* apply, //
RNX_VMP_APPLY_DFT_TO_DFT_TMP_BYTES_F* tmp_bytes) {
test_vmp_apply_dft_to_dft_outplace(apply, tmp_bytes);
test_vmp_apply_dft_to_dft_inplace(apply, tmp_bytes);
}
TEST(vec_rnx, vmp_apply_to_dft) {
test_vmp_apply_dft_to_dft(rnx_vmp_apply_dft_to_dft, rnx_vmp_apply_dft_to_dft_tmp_bytes);
}
TEST(vec_rnx, fft64_vmp_apply_dft_to_dft_ref) {
test_vmp_apply_dft_to_dft(fft64_rnx_vmp_apply_dft_to_dft_ref, fft64_rnx_vmp_apply_dft_to_dft_tmp_bytes_ref);
}
#ifdef __x86_64__
TEST(vec_rnx, fft64_vmp_apply_dft_to_dft_avx) {
test_vmp_apply_dft_to_dft(fft64_rnx_vmp_apply_dft_to_dft_avx, fft64_rnx_vmp_apply_dft_to_dft_tmp_bytes_avx);
}
#endif
/// rnx_vmp_prepare
static void test_vmp_prepare_contiguous(RNX_VMP_PREPARE_CONTIGUOUS_F* prepare_contiguous,
RNX_VMP_PREPARE_CONTIGUOUS_TMP_BYTES_F* tmp_bytes) {
// tests when n < 8
for (uint64_t nn : {2, 4}) {
const double one_over_m = 2. / nn;
MOD_RNX* module = new_rnx_module_info(nn, FFT64);
for (uint64_t nrows : {1, 2, 5}) {
for (uint64_t ncols : {2, 6, 7}) {
rnx_vec_f64_layout mat(nn, nrows * ncols, nn);
fft64_rnx_vmp_pmat_layout pmat(nn, nrows, ncols);
mat.fill_random(0);
std::vector<uint8_t> tmp_space(tmp_bytes(module));
thash hash_before = mat.content_hash();
prepare_contiguous(module, pmat.data, mat.data(), nrows, ncols, tmp_space.data());
ASSERT_EQ(mat.content_hash(), hash_before);
for (uint64_t row = 0; row < nrows; ++row) {
for (uint64_t col = 0; col < ncols; ++col) {
const double* pmatv = (double*)pmat.data + (col * nrows + row) * nn;
reim_fft64vec tmp = one_over_m * simple_fft64(mat.get_copy(row * ncols + col));
const double* tmpv = tmp.data();
for (uint64_t i = 0; i < nn; ++i) {
ASSERT_LE(fabs(pmatv[i] - tmpv[i]), 1e-10);
}
}
}
}
}
delete_rnx_module_info(module);
}
// tests when n >= 8
for (uint64_t nn : {8, 32}) {
const double one_over_m = 2. / nn;
MOD_RNX* module = new_rnx_module_info(nn, FFT64);
uint64_t nblk = nn / 8;
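// (assumed pmat layout for nn >= 8: coefficients are split into nn/8 blocks of 4 complex
// values, and within each block the nrows x ncols matrix is stored contiguously; this is
// what pmat.get(row, col, blk) reads back below)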
for (uint64_t nrows : {1, 2, 5}) {
for (uint64_t ncols : {2, 6, 7}) {
rnx_vec_f64_layout mat(nn, nrows * ncols, nn);
fft64_rnx_vmp_pmat_layout pmat(nn, nrows, ncols);
mat.fill_random(0);
std::vector<uint8_t> tmp_space(tmp_bytes(module));
thash hash_before = mat.content_hash();
prepare_contiguous(module, pmat.data, mat.data(), nrows, ncols, tmp_space.data());
ASSERT_EQ(mat.content_hash(), hash_before);
for (uint64_t row = 0; row < nrows; ++row) {
for (uint64_t col = 0; col < ncols; ++col) {
reim_fft64vec tmp = one_over_m * simple_fft64(mat.get_copy(row * ncols + col));
for (uint64_t blk = 0; blk < nblk; ++blk) {
reim4_elem expect = tmp.get_blk(blk);
reim4_elem actual = pmat.get(row, col, blk);
ASSERT_LE(infty_dist(actual, expect), 1e-10);
}
}
}
}
}
delete_rnx_module_info(module);
}
}
TEST(vec_rnx, vmp_prepare_contiguous) {
test_vmp_prepare_contiguous(rnx_vmp_prepare_contiguous, rnx_vmp_prepare_contiguous_tmp_bytes);
}
TEST(vec_rnx, fft64_vmp_prepare_contiguous_ref) {
test_vmp_prepare_contiguous(fft64_rnx_vmp_prepare_contiguous_ref, fft64_rnx_vmp_prepare_contiguous_tmp_bytes_ref);
}
#ifdef __x86_64__
TEST(vec_rnx, fft64_vmp_prepare_contiguous_avx) {
test_vmp_prepare_contiguous(fft64_rnx_vmp_prepare_contiguous_avx, fft64_rnx_vmp_prepare_contiguous_tmp_bytes_avx);
}
#endif
/// rnx_vmp_apply_tmp_a
static void test_vmp_apply_tmp_a_outplace( //
RNX_VMP_APPLY_TMP_A_F* apply, //
RNX_VMP_APPLY_TMP_A_TMP_BYTES_F* tmp_bytes) {
for (uint64_t nn : {2, 4, 8, 64}) {
MOD_RNX* module = new_rnx_module_info(nn, FFT64);
for (uint64_t mat_nrows : {1, 4, 7}) {
for (uint64_t mat_ncols : {1, 2, 5}) {
for (uint64_t in_size : {1, 4, 7}) {
for (uint64_t out_size : {1, 2, 5}) {
const uint64_t in_sl = nn + uniform_u64_bits(2);
const uint64_t out_sl = nn + uniform_u64_bits(2);
rnx_vec_f64_layout in(nn, in_size, in_sl);
fft64_rnx_vmp_pmat_layout pmat(nn, mat_nrows, mat_ncols);
rnx_vec_f64_layout out(nn, out_size, out_sl);
in.fill_random(0);
pmat.fill_random(0);
// naive computation of the product
std::vector<rnx_f64> expect(out_size);
for (uint64_t col = 0; col < out_size; ++col) {
reim_fft64vec ex = reim_fft64vec::zero(nn);
for (uint64_t row = 0; row < std::min(mat_nrows, in_size); ++row) {
ex += pmat.get_zext(row, col) * simple_fft64(in.get_copy(row));
}
expect[col] = simple_ifft64(ex);
}
// apply the product
std::vector<uint8_t> tmp(tmp_bytes(module, out_size, in_size, mat_nrows, mat_ncols));
apply(module, //
out.data(), out_size, out_sl, //
in.data(), in_size, in_sl, //
pmat.data, mat_nrows, mat_ncols, //
tmp.data());
// check that the output is close to the expected value
for (uint64_t col = 0; col < out_size; ++col) {
rnx_f64 actual = out.get_copy_zext(col);
ASSERT_LE(infty_dist(actual, expect[col]), 1e-10);
}
}
}
}
}
delete_rnx_module_info(module);
}
}
static void test_vmp_apply_tmp_a_inplace( //
RNX_VMP_APPLY_TMP_A_F* apply, //
RNX_VMP_APPLY_TMP_A_TMP_BYTES_F* tmp_bytes) {
for (uint64_t nn : {2, 4, 8, 64}) {
MOD_RNX* module = new_rnx_module_info(nn, FFT64);
for (uint64_t mat_nrows : {1, 4, 7}) {
for (uint64_t mat_ncols : {1, 2, 5}) {
for (uint64_t in_size : {1, 4, 7}) {
for (uint64_t out_size : {1, 2, 5}) {
const uint64_t in_out_sl = nn + uniform_u64_bits(2);
rnx_vec_f64_layout in_out(nn, std::max(in_size, out_size), in_out_sl);
fft64_rnx_vmp_pmat_layout pmat(nn, mat_nrows, mat_ncols);
in_out.fill_random(0);
pmat.fill_random(0);
// naive computation of the product
std::vector<rnx_f64> expect(out_size);
for (uint64_t col = 0; col < out_size; ++col) {
reim_fft64vec ex = reim_fft64vec::zero(nn);
for (uint64_t row = 0; row < std::min(mat_nrows, in_size); ++row) {
ex += pmat.get_zext(row, col) * simple_fft64(in_out.get_copy(row));
}
expect[col] = simple_ifft64(ex);
}
// apply the product
std::vector<uint8_t> tmp(tmp_bytes(module, out_size, in_size, mat_nrows, mat_ncols));
apply(module, //
in_out.data(), out_size, in_out_sl, //
in_out.data(), in_size, in_out_sl, //
pmat.data, mat_nrows, mat_ncols, //
tmp.data());
// check that the output is close to the expected value
for (uint64_t col = 0; col < out_size; ++col) {
rnx_f64 actual = in_out.get_copy_zext(col);
ASSERT_LE(infty_dist(actual, expect[col]), 1e-10);
}
}
}
}
}
delete_rnx_module_info(module);
}
}
static void test_vmp_apply_tmp_a( //
RNX_VMP_APPLY_TMP_A_F* apply, //
RNX_VMP_APPLY_TMP_A_TMP_BYTES_F* tmp_bytes) {
test_vmp_apply_tmp_a_outplace(apply, tmp_bytes);
test_vmp_apply_tmp_a_inplace(apply, tmp_bytes);
}
TEST(vec_znx, fft64_vmp_apply_tmp_a) { test_vmp_apply_tmp_a(rnx_vmp_apply_tmp_a, rnx_vmp_apply_tmp_a_tmp_bytes); }
TEST(vec_znx, fft64_vmp_apply_tmp_a_ref) {
test_vmp_apply_tmp_a(fft64_rnx_vmp_apply_tmp_a_ref, fft64_rnx_vmp_apply_tmp_a_tmp_bytes_ref);
}
#ifdef __x86_64__
TEST(vec_znx, fft64_vmp_apply_tmp_a_avx) {
test_vmp_apply_tmp_a(fft64_rnx_vmp_apply_tmp_a_avx, fft64_rnx_vmp_apply_tmp_a_tmp_bytes_avx);
}
#endif

View File

@@ -0,0 +1,265 @@
#include <gtest/gtest.h>
#include "spqlios/arithmetic/vec_znx_arithmetic_private.h"
#include "test/testlib/polynomial_vector.h"
#include "testlib/fft64_layouts.h"
#include "testlib/test_commons.h"
#define def_rand_big(varname, ringdim, varsize) \
fft64_vec_znx_big_layout varname(ringdim, varsize); \
varname.fill_random()
#define def_rand_small(varname, ringdim, varsize) \
znx_vec_i64_layout varname(ringdim, varsize, 2 * ringdim); \
varname.fill_random()
#define test_prelude(ringdim, moduletype, dim1, dim2, dim3) \
uint64_t n = ringdim; \
MODULE* module = new_module_info(ringdim, moduletype); \
for (uint64_t sa : {dim1, dim2, dim3}) { \
for (uint64_t sb : {dim1, dim2, dim3}) { \
for (uint64_t sr : {dim1, dim2, dim3})
#define test_end() \
} \
} \
delete_module_info(module)
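// usage sketch of the helpers above: the braced body runs once per (sr, sa, sb) triple,
// with `n` and `module` in scope, e.g.
//   test_prelude(8, FFT64, 3, 5, 7) { ... body using n, module, sr, sa, sb ... }
//   test_end();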
void test_fft64_vec_znx_big_add(VEC_ZNX_BIG_ADD_F vec_znx_big_add_fcn) {
test_prelude(8, FFT64, 3, 5, 7) {
fft64_vec_znx_big_layout r(n, sr);
def_rand_big(a, n, sa);
def_rand_big(b, n, sb);
vec_znx_big_add_fcn(module, r.data, sr, a.data, sa, b.data, sb);
for (uint64_t i = 0; i < sr; ++i) {
ASSERT_EQ(r.get_copy(i), a.get_copy_zext(i) + b.get_copy_zext(i));
}
}
test_end();
}
void test_fft64_vec_znx_big_add_small(VEC_ZNX_BIG_ADD_SMALL_F vec_znx_big_add_fcn) {
test_prelude(16, FFT64, 2, 4, 5) {
fft64_vec_znx_big_layout r(n, sr);
def_rand_big(a, n, sa);
def_rand_small(b, n, sb);
vec_znx_big_add_fcn(module, r.data, sr, a.data, sa, b.data(), sb, 2 * n);
for (uint64_t i = 0; i < sr; ++i) {
ASSERT_EQ(r.get_copy(i), a.get_copy_zext(i) + b.get_copy_zext(i));
}
}
test_end();
}
void test_fft64_vec_znx_big_add_small2(VEC_ZNX_BIG_ADD_SMALL2_F vec_znx_big_add_fcn) {
test_prelude(64, FFT64, 3, 6, 7) {
fft64_vec_znx_big_layout r(n, sr);
def_rand_small(a, n, sa);
def_rand_small(b, n, sb);
vec_znx_big_add_fcn(module, r.data, sr, a.data(), sa, 2 * n, b.data(), sb, 2 * n);
for (uint64_t i = 0; i < sr; ++i) {
ASSERT_EQ(r.get_copy(i), a.get_copy_zext(i) + b.get_copy_zext(i));
}
}
test_end();
}
TEST(fft64_vec_znx_big, fft64_vec_znx_big_add) { test_fft64_vec_znx_big_add(fft64_vec_znx_big_add); }
TEST(vec_znx_big, vec_znx_big_add) { test_fft64_vec_znx_big_add(vec_znx_big_add); }
TEST(fft64_vec_znx_big, fft64_vec_znx_big_add_small) { test_fft64_vec_znx_big_add_small(fft64_vec_znx_big_add_small); }
TEST(vec_znx_big, vec_znx_big_add_small) { test_fft64_vec_znx_big_add_small(vec_znx_big_add_small); }
TEST(fft64_vec_znx_big, fft64_vec_znx_big_add_small2) {
test_fft64_vec_znx_big_add_small2(fft64_vec_znx_big_add_small2);
}
TEST(vec_znx_big, vec_znx_big_add_small2) { test_fft64_vec_znx_big_add_small2(vec_znx_big_add_small2); }
void test_fft64_vec_znx_big_sub(VEC_ZNX_BIG_SUB_F vec_znx_big_sub_fcn) {
test_prelude(16, FFT64, 3, 5, 7) {
fft64_vec_znx_big_layout r(n, sr);
def_rand_big(a, n, sa);
def_rand_big(b, n, sb);
vec_znx_big_sub_fcn(module, r.data, sr, a.data, sa, b.data, sb);
for (uint64_t i = 0; i < sr; ++i) {
ASSERT_EQ(r.get_copy(i), a.get_copy_zext(i) - b.get_copy_zext(i));
}
}
test_end();
}
void test_fft64_vec_znx_big_sub_small_a(VEC_ZNX_BIG_SUB_SMALL_A_F vec_znx_big_sub_fcn) {
test_prelude(32, FFT64, 2, 4, 5) {
fft64_vec_znx_big_layout r(n, sr);
def_rand_small(a, n, sa);
def_rand_big(b, n, sb);
vec_znx_big_sub_fcn(module, r.data, sr, a.data(), sa, 2 * n, b.data, sb);
for (uint64_t i = 0; i < sr; ++i) {
ASSERT_EQ(r.get_copy(i), a.get_copy_zext(i) - b.get_copy_zext(i));
}
}
test_end();
}
void test_fft64_vec_znx_big_sub_small_b(VEC_ZNX_BIG_SUB_SMALL_B_F vec_znx_big_sub_fcn) {
test_prelude(16, FFT64, 2, 4, 5) {
fft64_vec_znx_big_layout r(n, sr);
def_rand_big(a, n, sa);
def_rand_small(b, n, sb);
vec_znx_big_sub_fcn(module, r.data, sr, a.data, sa, b.data(), sb, 2 * n);
for (uint64_t i = 0; i < sr; ++i) {
ASSERT_EQ(r.get_copy(i), a.get_copy_zext(i) - b.get_copy_zext(i));
}
}
test_end();
}
void test_fft64_vec_znx_big_sub_small2(VEC_ZNX_BIG_SUB_SMALL2_F vec_znx_big_sub_fcn) {
test_prelude(8, FFT64, 3, 6, 7) {
fft64_vec_znx_big_layout r(n, sr);
def_rand_small(a, n, sa);
def_rand_small(b, n, sb);
vec_znx_big_sub_fcn(module, r.data, sr, a.data(), sa, 2 * n, b.data(), sb, 2 * n);
for (uint64_t i = 0; i < sr; ++i) {
ASSERT_EQ(r.get_copy(i), a.get_copy_zext(i) - b.get_copy_zext(i));
}
}
test_end();
}
TEST(fft64_vec_znx_big, fft64_vec_znx_big_sub) { test_fft64_vec_znx_big_sub(fft64_vec_znx_big_sub); }
TEST(vec_znx_big, vec_znx_big_sub) { test_fft64_vec_znx_big_sub(vec_znx_big_sub); }
TEST(fft64_vec_znx_big, fft64_vec_znx_big_sub_small_a) {
test_fft64_vec_znx_big_sub_small_a(fft64_vec_znx_big_sub_small_a);
}
TEST(vec_znx_big, vec_znx_big_sub_small_a) { test_fft64_vec_znx_big_sub_small_a(vec_znx_big_sub_small_a); }
TEST(fft64_vec_znx_big, fft64_vec_znx_big_sub_small_b) {
test_fft64_vec_znx_big_sub_small_b(fft64_vec_znx_big_sub_small_b);
}
TEST(vec_znx_big, vec_znx_big_sub_small_b) { test_fft64_vec_znx_big_sub_small_b(vec_znx_big_sub_small_b); }
TEST(fft64_vec_znx_big, fft64_vec_znx_big_sub_small2) {
test_fft64_vec_znx_big_sub_small2(fft64_vec_znx_big_sub_small2);
}
TEST(vec_znx_big, vec_znx_big_sub_small2) { test_fft64_vec_znx_big_sub_small2(vec_znx_big_sub_small2); }
static void test_vec_znx_big_normalize(VEC_ZNX_BIG_NORMALIZE_BASE2K_F normalize,
VEC_ZNX_BIG_NORMALIZE_BASE2K_TMP_BYTES_F normalize_tmp_bytes) {
// in the FFT64 case, big_normalize is just a forward.
// we will just test that the functions are callable
uint64_t n = 16;
uint64_t k = 12;
MODULE* module = new_module_info(n, FFT64);
for (uint64_t sa : {3, 5, 7}) {
for (uint64_t sr : {3, 5, 7}) {
uint64_t r_sl = n + 3;
def_rand_big(a, n, sa);
znx_vec_i64_layout r(n, sr, r_sl);
std::vector<uint8_t> tmp_space(normalize_tmp_bytes(module));
normalize(module, k, r.data(), sr, r_sl, a.data, sa, tmp_space.data());
}
}
delete_module_info(module);
}
TEST(vec_znx_big, fft64_vec_znx_big_normalize_base2k) {
test_vec_znx_big_normalize(fft64_vec_znx_big_normalize_base2k, fft64_vec_znx_big_normalize_base2k_tmp_bytes);
}
TEST(vec_znx_big, vec_znx_big_normalize_base2k) {
test_vec_znx_big_normalize(vec_znx_big_normalize_base2k, vec_znx_big_normalize_base2k_tmp_bytes);
}
static void test_vec_znx_big_range_normalize( //
VEC_ZNX_BIG_RANGE_NORMALIZE_BASE2K_F normalize,
VEC_ZNX_BIG_RANGE_NORMALIZE_BASE2K_TMP_BYTES_F normalize_tmp_bytes) {
// in the FFT64 case, big_normalize is just a forward.
// the range variant is checked for consistency against the plain normalize applied to the extracted range
uint64_t n = 16;
uint64_t k = 11;
MODULE* module = new_module_info(n, FFT64);
for (uint64_t sa : {6, 15, 21}) {
for (uint64_t sr : {3, 5, 7}) {
uint64_t r_sl = n + 3;
def_rand_big(a, n, sa);
uint64_t a_start = uniform_u64_bits(30) % (sa / 2);
uint64_t a_end = sa - (uniform_u64_bits(30) % (sa / 2));
uint64_t a_step = (uniform_u64_bits(30) % 3) + 1;
uint64_t range_size = (a_end + a_step - 1 - a_start) / a_step;
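// range_size = ceil((a_end - a_start) / a_step): number of limbs picked from the strided range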
fft64_vec_znx_big_layout aextr(n, range_size);
for (uint64_t i = 0, idx = a_start; idx < a_end; ++i, idx += a_step) {
aextr.set(i, a.get_copy(idx));
}
znx_vec_i64_layout r(n, sr, r_sl);
znx_vec_i64_layout r2(n, sr, r_sl);
// tmp_space is large enough for both calls
std::vector<uint8_t> tmp_space(normalize_tmp_bytes(module));
normalize(module, k, r.data(), sr, r_sl, a.data, a_start, a_end, a_step, tmp_space.data());
fft64_vec_znx_big_normalize_base2k(module, k, r2.data(), sr, r_sl, aextr.data, range_size, tmp_space.data());
for (uint64_t i = 0; i < sr; ++i) {
ASSERT_EQ(r.get_copy(i), r2.get_copy(i));
}
}
}
delete_module_info(module);
}
TEST(vec_znx_big, fft64_vec_znx_big_range_normalize_base2k) {
test_vec_znx_big_range_normalize(fft64_vec_znx_big_range_normalize_base2k,
fft64_vec_znx_big_range_normalize_base2k_tmp_bytes);
}
TEST(vec_znx_big, vec_znx_big_range_normalize_base2k) {
test_vec_znx_big_range_normalize(vec_znx_big_range_normalize_base2k, vec_znx_big_range_normalize_base2k_tmp_bytes);
}
static void test_vec_znx_big_rotate(VEC_ZNX_BIG_ROTATE_F rotate) {
// checks that big_rotate matches a coefficient-wise negacyclic rotation of each limb
uint64_t n = 16;
int64_t p = 12;
MODULE* module = new_module_info(n, FFT64);
for (uint64_t sa : {3, 5, 7}) {
for (uint64_t sr : {3, 5, 7}) {
def_rand_big(a, n, sa);
fft64_vec_znx_big_layout r(n, sr);
rotate(module, p, r.data, sr, a.data, sa);
for (uint64_t i = 0; i < sr; ++i) {
znx_i64 aa = a.get_copy_zext(i);
znx_i64 expect(n);
for (uint64_t j = 0; j < n; ++j) {
expect.set_coeff(j, aa.get_coeff(int64_t(j) - p));
}
znx_i64 actual = r.get_copy(i);
ASSERT_EQ(expect, actual);
}
}
}
delete_module_info(module);
}
TEST(vec_znx_big, fft64_vec_znx_big_rotate) { test_vec_znx_big_rotate(fft64_vec_znx_big_rotate); }
TEST(vec_znx_big, vec_znx_big_rotate) { test_vec_znx_big_rotate(vec_znx_big_rotate); }
static void test_vec_znx_big_automorphism(VEC_ZNX_BIG_AUTOMORPHISM_F automorphism) {
// checks that big_automorphism matches the coefficient-wise automorphism X -> X^p of each limb
uint64_t n = 16;
int64_t p = 11;
MODULE* module = new_module_info(n, FFT64);
for (uint64_t sa : {3, 5, 7}) {
for (uint64_t sr : {3, 5, 7}) {
def_rand_big(a, n, sa);
fft64_vec_znx_big_layout r(n, sr);
automorphism(module, p, r.data, sr, a.data, sa);
for (uint64_t i = 0; i < sr; ++i) {
znx_i64 aa = a.get_copy_zext(i);
znx_i64 expect(n);
for (uint64_t j = 0; j < n; ++j) {
expect.set_coeff(p * j, aa.get_coeff(j));
}
znx_i64 actual = r.get_copy(i);
ASSERT_EQ(expect, actual);
}
}
}
delete_module_info(module);
}
TEST(vec_znx_big, fft64_vec_znx_big_automorphism) { test_vec_znx_big_automorphism(fft64_vec_znx_big_automorphism); }
TEST(vec_znx_big, vec_znx_big_automorphism) { test_vec_znx_big_automorphism(vec_znx_big_automorphism); }

View File

@@ -0,0 +1,193 @@
#include <gtest/gtest.h>
#include <cstdint>
#include "../spqlios/arithmetic/vec_znx_arithmetic_private.h"
#include "spqlios/arithmetic/vec_znx_arithmetic.h"
#include "test/testlib/ntt120_dft.h"
#include "test/testlib/ntt120_layouts.h"
#include "testlib/fft64_dft.h"
#include "testlib/fft64_layouts.h"
#include "testlib/polynomial_vector.h"
static void test_fft64_vec_znx_dft(VEC_ZNX_DFT_F dft) {
for (uint64_t n : {2, 4, 128}) {
MODULE* module = new_module_info(n, FFT64);
for (uint64_t sa : {3, 5, 8}) {
for (uint64_t sr : {3, 5, 8}) {
uint64_t a_sl = n + uniform_u64_bits(2);
znx_vec_i64_layout a(n, sa, a_sl);
fft64_vec_znx_dft_layout res(n, sr);
a.fill_random(42);
std::vector<reim_fft64vec> expect(sr);
for (uint64_t i = 0; i < sr; ++i) {
expect[i] = simple_fft64(a.get_copy_zext(i));
}
// test the function
thash hash_before = a.content_hash();
dft(module, res.data, sr, a.data(), sa, a_sl);
ASSERT_EQ(a.content_hash(), hash_before);
for (uint64_t i = 0; i < sr; ++i) {
reim_fft64vec actual = res.get_copy_zext(i);
ASSERT_LE(infty_dist(actual, expect[i]), 1e-10);
}
}
}
delete_module_info(module);
}
}
#ifdef __x86_64__
// FIXME: currently, it only works on avx
static void test_ntt120_vec_znx_dft(VEC_ZNX_DFT_F dft) {
for (uint64_t n : {2, 4, 128}) {
MODULE* module = new_module_info(n, NTT120);
for (uint64_t sa : {3, 5, 8}) {
for (uint64_t sr : {3, 5, 8}) {
uint64_t a_sl = n + uniform_u64_bits(2);
znx_vec_i64_layout a(n, sa, a_sl);
ntt120_vec_znx_dft_layout res(n, sr);
a.fill_random(42);
std::vector<q120_nttvec> expect(sr);
for (uint64_t i = 0; i < sr; ++i) {
expect[i] = simple_ntt120(a.get_copy_zext(i));
}
// test the function
thash hash_before = a.content_hash();
dft(module, res.data, sr, a.data(), sa, a_sl);
ASSERT_EQ(a.content_hash(), hash_before);
for (uint64_t i = 0; i < sr; ++i) {
q120_nttvec actual = res.get_copy_zext(i);
if (!(actual == expect[i])) {
for (uint64_t j = 0; j < n; ++j) {
std::cerr << actual.v[j] << " vs " << expect[i].v[j] << std::endl;
}
}
ASSERT_EQ(actual, expect[i]);
}
}
}
delete_module_info(module);
}
}
#endif
TEST(vec_znx_dft, fft64_vec_znx_dft) { test_fft64_vec_znx_dft(fft64_vec_znx_dft); }
#ifdef __x86_64__
// FIXME: currently, it only works on avx
TEST(vec_znx_dft, ntt120_vec_znx_dft) { test_ntt120_vec_znx_dft(ntt120_vec_znx_dft_avx); }
#endif
TEST(vec_znx_dft, vec_znx_dft) {
test_fft64_vec_znx_dft(vec_znx_dft);
#ifdef __x86_64__
// FIXME: currently, it only works on avx
test_ntt120_vec_znx_dft(ntt120_vec_znx_dft_avx);
#endif
}
static void test_fft64_vec_znx_idft(VEC_ZNX_IDFT_F idft, VEC_ZNX_IDFT_TMP_A_F idft_tmp_a,
VEC_ZNX_IDFT_TMP_BYTES_F idft_tmp_bytes) {
for (uint64_t n : {2, 4, 64, 128}) {
MODULE* module = new_module_info(n, FFT64);
uint64_t tmp_size = idft_tmp_bytes ? idft_tmp_bytes(module) : 0;
std::vector<uint8_t> tmp(tmp_size);
for (uint64_t sa : {3, 5, 8}) {
for (uint64_t sr : {3, 5, 8}) {
fft64_vec_znx_dft_layout a(n, sa);
fft64_vec_znx_big_layout res(n, sr);
a.fill_dft_random_log2bound(22);
std::vector<znx_i64> expect(sr);
for (uint64_t i = 0; i < sr; ++i) {
expect[i] = simple_rint_ifft64(a.get_copy_zext(i));
}
// test the function
if (idft_tmp_bytes) {
thash hash_before = a.content_hash();
idft(module, res.data, sr, a.data, sa, tmp.data());
ASSERT_EQ(a.content_hash(), hash_before);
} else {
idft_tmp_a(module, res.data, sr, a.data, sa);
}
for (uint64_t i = 0; i < sr; ++i) {
znx_i64 actual = res.get_copy_zext(i);
// ASSERT_EQ(res.get_copy_zext(i), expect[i]);
if (!(actual == expect[i])) {
for (uint64_t j = 0; j < n; ++j) {
std::cerr << actual.get_coeff(j) << " dft vs. " << expect[i].get_coeff(j) << std::endl;
}
FAIL();
}
}
}
}
delete_module_info(module);
}
}
TEST(vec_znx_dft, fft64_vec_znx_idft) {
test_fft64_vec_znx_idft(fft64_vec_znx_idft, nullptr, fft64_vec_znx_idft_tmp_bytes);
}
TEST(vec_znx_dft, fft64_vec_znx_idft_tmp_a) { test_fft64_vec_znx_idft(nullptr, fft64_vec_znx_idft_tmp_a, nullptr); }
#ifdef __x86_64__
// FIXME: currently, it only works on avx
static void test_ntt120_vec_znx_idft(VEC_ZNX_IDFT_F idft, VEC_ZNX_IDFT_TMP_A_F idft_tmp_a,
VEC_ZNX_IDFT_TMP_BYTES_F idft_tmp_bytes) {
for (uint64_t n : {2, 4, 64, 128}) {
MODULE* module = new_module_info(n, NTT120);
uint64_t tmp_size = idft_tmp_bytes ? idft_tmp_bytes(module) : 0;
std::vector<uint8_t> tmp(tmp_size);
for (uint64_t sa : {3, 5, 8}) {
for (uint64_t sr : {3, 5, 8}) {
ntt120_vec_znx_dft_layout a(n, sa);
ntt120_vec_znx_big_layout res(n, sr);
a.fill_random();
std::vector<znx_i128> expect(sr);
for (uint64_t i = 0; i < sr; ++i) {
expect[i] = simple_intt120(a.get_copy_zext(i));
}
// test the function
if (idft_tmp_bytes) {
thash hash_before = a.content_hash();
idft(module, res.data, sr, a.data, sa, tmp.data());
ASSERT_EQ(a.content_hash(), hash_before);
} else {
idft_tmp_a(module, res.data, sr, a.data, sa);
}
for (uint64_t i = 0; i < sr; ++i) {
znx_i128 actual = res.get_copy_zext(i);
ASSERT_EQ(res.get_copy_zext(i), expect[i]);
// if (!(actual == expect[i])) {
// for (uint64_t j = 0; j < n; ++j) {
// std::cerr << actual.get_coeff(j) << " dft vs. " << expect[i].get_coeff(j) << std::endl;
// }
// FAIL();
// }
}
}
}
delete_module_info(module);
}
}
TEST(vec_znx_dft, ntt120_vec_znx_idft) {
test_ntt120_vec_znx_idft(ntt120_vec_znx_idft_avx, nullptr, ntt120_vec_znx_idft_tmp_bytes_avx);
}
TEST(vec_znx_dft, ntt120_vec_znx_idft_tmp_a) {
test_ntt120_vec_znx_idft(nullptr, ntt120_vec_znx_idft_tmp_a_avx, nullptr);
}
#endif
TEST(vec_znx_dft, vec_znx_idft) {
test_fft64_vec_znx_idft(vec_znx_idft, nullptr, vec_znx_idft_tmp_bytes);
#ifdef __x86_64__
// FIXME: currently, only supported on avx
test_ntt120_vec_znx_idft(vec_znx_idft, nullptr, vec_znx_idft_tmp_bytes);
#endif
}
TEST(vec_znx_dft, vec_znx_idft_tmp_a) {
test_fft64_vec_znx_idft(nullptr, vec_znx_idft_tmp_a, nullptr);
#ifdef __x86_64__
// FIXME: currently, only supported on avx
test_ntt120_vec_znx_idft(nullptr, vec_znx_idft_tmp_a, nullptr);
#endif
}

View File

@@ -0,0 +1,546 @@
#include <cstdint>
#include <utility>
#include "../spqlios/arithmetic/vec_znx_arithmetic.h"
#include "gtest/gtest.h"
#include "spqlios/arithmetic/vec_znx_arithmetic_private.h"
#include "spqlios/coeffs/coeffs_arithmetic.h"
#include "test/testlib/mod_q120.h"
#include "test/testlib/negacyclic_polynomial.h"
#include "testlib/fft64_dft.h"
#include "testlib/polynomial_vector.h"
TEST(fft64_layouts, dft_idft_fft64) {
uint64_t n = 128;
// create a random polynomial
znx_i64 p(n);
for (uint64_t i = 0; i < n; ++i) {
p.set_coeff(i, uniform_i64_bits(36));
}
// call fft
reim_fft64vec q = simple_fft64(p);
// call ifft and round
znx_i64 r = simple_rint_ifft64(q);
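// exact recovery is expected here (rough fp64 heuristic, not a proven bound): with 36-bit
// coefficients and n = 128 the roundtrip error of the double-precision FFT stays far below 1/2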
ASSERT_EQ(p, r);
}
TEST(znx_layout, valid_test) {
uint64_t n = 4;
znx_vec_i64_layout v(n, 7, 13);
// this should be ok
v.set(0, znx_i64::zero(n));
// this should be ok
ASSERT_EQ(v.get_copy_zext(0), znx_i64::zero(n));
ASSERT_EQ(v.data()[2], 0); // should be ok
// this is also ok (zero extended vector)
ASSERT_EQ(v.get_copy_zext(1000), znx_i64::zero(n));
}
// this test is disabled by default, since it deliberately performs invalid accesses
#if 0
TEST(znx_layout, valgrind_antipattern_test) {
uint64_t n = 4;
znx_vec_i64_layout v(n, 7, 13);
// this should be ok
v.set(0, znx_i64::zero(n));
// this should abort (wrong ring dimension)
ASSERT_DEATH(v.set(3, znx_i64::zero(2 * n)), "");
// this should abort (out of bounds)
ASSERT_DEATH(v.set(8, znx_i64::zero(n)), "");
// this should be ok
ASSERT_EQ(v.get_copy_zext(0), znx_i64::zero(n));
// should be an uninit read
ASSERT_TRUE(!(v.get_copy_zext(2) == znx_i64::zero(n))); // should be uninit
// should be an invalid read (inter-slice)
ASSERT_NE(v.data()[4], 0);
ASSERT_EQ(v.data()[2], 0); // should be ok
// should be an uninit read
ASSERT_NE(v.data()[13], 0); // should be uninit
}
#endif
// test of binary operations
// test for out of place calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_binop_outplace(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
for (uint64_t n : {2, 4, 8, 128}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {7, 13, 15}) {
for (uint64_t sb : {7, 13, 15}) {
for (uint64_t sc : {7, 13, 15}) {
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
uint64_t b_sl = uniform_u64_bits(3) * 5 + n;
uint64_t c_sl = uniform_u64_bits(3) * 5 + n;
znx_vec_i64_layout la(n, sa, a_sl);
znx_vec_i64_layout lb(n, sb, b_sl);
znx_vec_i64_layout lc(n, sc, c_sl);
std::vector<znx_i64> expect(sc);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sb; ++i) {
lb.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sc; ++i) {
expect[i] = ref_binop(la.get_copy_zext(i), lb.get_copy_zext(i));
}
binop(mod, // N
lc.data(), sc, c_sl, // res
la.data(), sa, a_sl, // a
lb.data(), sb, b_sl);
for (uint64_t i = 0; i < sc; ++i) {
ASSERT_EQ(lc.get_copy_zext(i), expect[i]);
}
}
}
}
delete_module_info(mod);
}
}
// test for inplace1 calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_binop_inplace1(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
for (uint64_t n : {2, 4, 64}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {3, 9, 12}) {
for (uint64_t sb : {3, 9, 12}) {
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
uint64_t b_sl = uniform_u64_bits(3) * 5 + n;
znx_vec_i64_layout la(n, sa, a_sl);
znx_vec_i64_layout lb(n, sb, b_sl);
std::vector<znx_i64> expect(sa);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sb; ++i) {
lb.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sa; ++i) {
expect[i] = ref_binop(la.get_copy_zext(i), lb.get_copy_zext(i));
}
binop(mod, // N
la.data(), sa, a_sl, // res
la.data(), sa, a_sl, // a
lb.data(), sb, b_sl);
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), expect[i]);
}
}
}
delete_module_info(mod);
}
}
// test for inplace2 calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_binop_inplace2(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
for (uint64_t n : {4, 32, 64}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {3, 9, 12}) {
for (uint64_t sb : {3, 9, 12}) {
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
uint64_t b_sl = uniform_u64_bits(3) * 5 + n;
znx_vec_i64_layout la(n, sa, a_sl);
znx_vec_i64_layout lb(n, sb, b_sl);
std::vector<znx_i64> expect(sb);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sb; ++i) {
lb.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sb; ++i) {
expect[i] = ref_binop(la.get_copy_zext(i), lb.get_copy_zext(i));
}
binop(mod, // N
lb.data(), sb, b_sl, // res
la.data(), sa, a_sl, // a
lb.data(), sb, b_sl);
for (uint64_t i = 0; i < sb; ++i) {
ASSERT_EQ(lb.get_copy_zext(i), expect[i]);
}
}
}
delete_module_info(mod);
}
}
// test for inplace3 calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_binop_inplace3(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
for (uint64_t n : {2, 16, 1024}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {2, 6, 11}) {
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
znx_vec_i64_layout la(n, sa, a_sl);
std::vector<znx_i64> expect(sa);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sa; ++i) {
expect[i] = ref_binop(la.get_copy_zext(i), la.get_copy_zext(i));
}
binop(mod, // N
la.data(), sa, a_sl, // res
la.data(), sa, a_sl, // a
la.data(), sa, a_sl);
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), expect[i]);
}
}
delete_module_info(mod);
}
}
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_binop(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
test_vec_znx_elemw_binop_outplace(binop, ref_binop);
test_vec_znx_elemw_binop_inplace1(binop, ref_binop);
test_vec_znx_elemw_binop_inplace2(binop, ref_binop);
test_vec_znx_elemw_binop_inplace3(binop, ref_binop);
}
static znx_i64 poly_add(const znx_i64& a, const znx_i64& b) { return a + b; }
TEST(vec_znx, vec_znx_add) { test_vec_znx_elemw_binop(vec_znx_add, poly_add); }
TEST(vec_znx, vec_znx_add_ref) { test_vec_znx_elemw_binop(vec_znx_add_ref, poly_add); }
#ifdef __x86_64__
TEST(vec_znx, vec_znx_add_avx) { test_vec_znx_elemw_binop(vec_znx_add_avx, poly_add); }
#endif
static znx_i64 poly_sub(const znx_i64& a, const znx_i64& b) { return a - b; }
TEST(vec_znx, vec_znx_sub) { test_vec_znx_elemw_binop(vec_znx_sub, poly_sub); }
TEST(vec_znx, vec_znx_sub_ref) { test_vec_znx_elemw_binop(vec_znx_sub_ref, poly_sub); }
#ifdef __x86_64__
TEST(vec_znx, vec_znx_sub_avx) { test_vec_znx_elemw_binop(vec_znx_sub_avx, poly_sub); }
#endif
// test of rotation operations
// test for out of place calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_unop_param_outplace(ACTUAL_FCN test_rotate, EXPECT_FCN ref_rotate, int64_t (*param_gen)()) {
for (uint64_t n : {2, 4, 8, 128}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {7, 13, 15}) {
for (uint64_t sb : {7, 13, 15}) {
{
int64_t p = param_gen();
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
uint64_t b_sl = uniform_u64_bits(3) * 4 + n;
znx_vec_i64_layout la(n, sa, a_sl);
znx_vec_i64_layout lb(n, sb, b_sl);
std::vector<znx_i64> expect(sb);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sb; ++i) {
expect[i] = ref_rotate(p, la.get_copy_zext(i));
}
test_rotate(mod, //
p, //
lb.data(), sb, b_sl, //
la.data(), sa, a_sl //
);
for (uint64_t i = 0; i < sb; ++i) {
ASSERT_EQ(lb.get_copy_zext(i), expect[i]) << n << " " << sa << " " << sb << " " << i;
}
}
}
}
delete_module_info(mod);
}
}
// test for inplace calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_unop_param_inplace(ACTUAL_FCN test_rotate, EXPECT_FCN ref_rotate, int64_t (*param_gen)()) {
for (uint64_t n : {2, 16, 1024}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {2, 6, 11}) {
{
int64_t p = param_gen();
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
znx_vec_i64_layout la(n, sa, a_sl);
std::vector<znx_i64> expect(sa);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sa; ++i) {
expect[i] = ref_rotate(p, la.get_copy_zext(i));
}
test_rotate(mod, // N
p, //
la.data(), sa, a_sl, // res
la.data(), sa, a_sl // a
);
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), expect[i]) << n << " " << sa << " " << i;
}
}
}
delete_module_info(mod);
}
}
static int64_t random_rotate_param() { return uniform_i64(); }
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_rotate(ACTUAL_FCN binop, EXPECT_FCN ref_binop) {
test_vec_znx_elemw_unop_param_outplace(binop, ref_binop, random_rotate_param);
test_vec_znx_elemw_unop_param_inplace(binop, ref_binop, random_rotate_param);
}
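// reference rotation: res(X) = X^p * a(X) in Z[X]/(X^n + 1), i.e. res_i = a_{i-p} with the
// negacyclic sign handled by get_coeff/set_coeff; e.g. for n = 4, p = 1:
//   a0 + a1*X + a2*X^2 + a3*X^3  ->  -a3 + a0*X + a1*X^2 + a2*X^3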
static znx_i64 poly_rotate(const int64_t p, const znx_i64& a) {
uint64_t n = a.nn();
znx_i64 res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, a.get_coeff(i - p));
}
return res;
}
TEST(vec_znx, vec_znx_rotate) { test_vec_znx_elemw_rotate(vec_znx_rotate, poly_rotate); }
TEST(vec_znx, vec_znx_rotate_ref) { test_vec_znx_elemw_rotate(vec_znx_rotate_ref, poly_rotate); }
static int64_t random_automorphism_param() { return uniform_i64() | 1; }
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_automorphism(ACTUAL_FCN unop, EXPECT_FCN ref_unop) {
test_vec_znx_elemw_unop_param_outplace(unop, ref_unop, random_automorphism_param);
test_vec_znx_elemw_unop_param_inplace(unop, ref_unop, random_automorphism_param);
}
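// reference automorphism: res(X) = a(X^p) in Z[X]/(X^n + 1), for odd p (the test draws
// p = uniform_i64() | 1); coefficient i of a lands at index i*p, with the negacyclic
// wrap-around sign handled by set_coeff.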
static znx_i64 poly_automorphism(const int64_t p, const znx_i64& a) {
uint64_t n = a.nn();
znx_i64 res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i * p, a.get_coeff(i));
}
return res;
}
TEST(vec_znx, vec_znx_automorphism) { test_vec_znx_elemw_automorphism(vec_znx_automorphism, poly_automorphism); }
TEST(vec_znx, vec_znx_automorphism_ref) {
test_vec_znx_elemw_automorphism(vec_znx_automorphism_ref, poly_automorphism);
}
// test for out of place calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_unop_outplace(ACTUAL_FCN test_unop, EXPECT_FCN ref_unop) {
for (uint64_t n : {2, 4, 8, 128}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {7, 13, 15}) {
for (uint64_t sb : {7, 13, 15}) {
{
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
uint64_t b_sl = uniform_u64_bits(3) * 4 + n;
znx_vec_i64_layout la(n, sa, a_sl);
znx_vec_i64_layout lb(n, sb, b_sl);
std::vector<znx_i64> expect(sb);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sb; ++i) {
expect[i] = ref_unop(la.get_copy_zext(i));
}
test_unop(mod, //
lb.data(), sb, b_sl, //
la.data(), sa, a_sl //
);
for (uint64_t i = 0; i < sb; ++i) {
ASSERT_EQ(lb.get_copy_zext(i), expect[i]) << n << " " << sa << " " << sb << " " << i;
}
}
}
}
delete_module_info(mod);
}
}
// test for inplace calls
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_unop_inplace(ACTUAL_FCN test_unop, EXPECT_FCN ref_unop) {
for (uint64_t n : {2, 16, 1024}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {2, 6, 11}) {
{
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
znx_vec_i64_layout la(n, sa, a_sl);
std::vector<znx_i64> expect(sa);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
for (uint64_t i = 0; i < sa; ++i) {
expect[i] = ref_unop(la.get_copy_zext(i));
}
test_unop(mod, // N
la.data(), sa, a_sl, // res
la.data(), sa, a_sl // a
);
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), expect[i]) << n << " " << sa << " " << i;
}
}
}
delete_module_info(mod);
}
}
template <typename ACTUAL_FCN, typename EXPECT_FCN>
void test_vec_znx_elemw_unop(ACTUAL_FCN unop, EXPECT_FCN ref_unop) {
test_vec_znx_elemw_unop_outplace(unop, ref_unop);
test_vec_znx_elemw_unop_inplace(unop, ref_unop);
}
static znx_i64 poly_copy(const znx_i64& a) { return a; }
TEST(vec_znx, vec_znx_copy) { test_vec_znx_elemw_unop(vec_znx_copy, poly_copy); }
TEST(vec_znx, vec_znx_copy_ref) { test_vec_znx_elemw_unop(vec_znx_copy_ref, poly_copy); }
static znx_i64 poly_negate(const znx_i64& a) { return -a; }
TEST(vec_znx, vec_znx_negate) { test_vec_znx_elemw_unop(vec_znx_negate, poly_negate); }
TEST(vec_znx, vec_znx_negate_ref) { test_vec_znx_elemw_unop(vec_znx_negate_ref, poly_negate); }
#ifdef __x86_64__
TEST(vec_znx, vec_znx_negate_avx) { test_vec_znx_elemw_unop(vec_znx_negate_avx, poly_negate); }
#endif
static void test_vec_znx_zero(VEC_ZNX_ZERO_F zero) {
for (uint64_t n : {2, 16, 1024}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {2, 6, 11}) {
{
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
znx_vec_i64_layout la(n, sa, a_sl);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
zero(mod, // N
la.data(), sa, a_sl // res
);
znx_i64 ZERO = znx_i64::zero(n);
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), ZERO) << n << " " << sa << " " << i;
}
}
}
delete_module_info(mod);
}
}
TEST(vec_znx, vec_znx_zero) { test_vec_znx_zero(vec_znx_zero); }
TEST(vec_znx, vec_znx_zero_ref) { test_vec_znx_zero(vec_znx_zero_ref); }
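// reference normalization: walks the limbs from last to first and propagates carries so
// that, read as a base-2^base_k expansion, each output limb lands in the centered digit
// range; the exact per-limb behaviour is delegated to znx_normalize(n, base_k, out,
// carry_out, in, carry_in), whose semantics are assumed here from the way it is called below.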
static void vec_poly_normalize(const uint64_t base_k, std::vector<znx_i64>& in) {
if (in.size() > 0) {
uint64_t n = in.front().nn();
znx_i64 cinout(n);
for (int64_t i = in.size() - 1; i >= 0; --i) {
znx_normalize(n, base_k, in[i].data(), cinout.data(), in[i].data(), cinout.data());
}
}
}
template <typename ACTUAL_FCN, typename TMP_BYTES_FNC>
void test_vec_znx_normalize_outplace(ACTUAL_FCN test_normalize, TMP_BYTES_FNC tmp_bytes) {
for (uint64_t n : {2, 16, 1024}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {1, 2, 6, 11}) {
for (uint64_t sb : {1, 2, 6, 11}) {
for (uint64_t base_k : {19}) {
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
znx_vec_i64_layout la(n, sa, a_sl);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
std::vector<znx_i64> la_norm;
for (uint64_t i = 0; i < sa; ++i) {
la_norm.push_back(la.get_copy_zext(i));
}
vec_poly_normalize(base_k, la_norm);
uint64_t b_sl = uniform_u64_bits(3) * 5 + n;
znx_vec_i64_layout lb(n, sb, b_sl);
const uint64_t tmp_size = tmp_bytes(mod);
uint8_t* tmp = new uint8_t[tmp_size];
test_normalize(mod, // N
base_k, // base_k
lb.data(), sb, b_sl, // res
la.data(), sa, a_sl, // a
tmp);
delete[] tmp;
for (uint64_t i = 0; i < std::min(sa, sb); ++i) {
ASSERT_EQ(lb.get_copy_zext(i), la_norm[i]) << n << " " << sa << " " << sb << " " << i;
}
znx_i64 zero(n);
for (uint64_t i = std::min(sa, sb); i < sb; ++i) {
ASSERT_EQ(lb.get_copy_zext(i), zero) << n << " " << sa << " " << sb << " " << i;
}
}
}
}
delete_module_info(mod);
}
}
TEST(vec_znx, vec_znx_normalize_outplace) {
test_vec_znx_normalize_outplace(vec_znx_normalize_base2k, vec_znx_normalize_base2k_tmp_bytes);
}
TEST(vec_znx, vec_znx_normalize_outplace_ref) {
test_vec_znx_normalize_outplace(vec_znx_normalize_base2k_ref, vec_znx_normalize_base2k_tmp_bytes_ref);
}
template <typename ACTUAL_FCN, typename TMP_BYTES_FNC>
void test_vec_znx_normalize_inplace(ACTUAL_FCN test_normalize, TMP_BYTES_FNC tmp_bytes) {
for (uint64_t n : {2, 16, 1024}) {
MODULE_TYPE mtype = uniform_u64() % 2 == 0 ? FFT64 : NTT120;
MODULE* mod = new_module_info(n, mtype);
for (uint64_t sa : {2, 6, 11}) {
for (uint64_t base_k : {19}) {
uint64_t a_sl = uniform_u64_bits(3) * 5 + n;
znx_vec_i64_layout la(n, sa, a_sl);
for (uint64_t i = 0; i < sa; ++i) {
la.set(i, znx_i64::random_log2bound(n, 62));
}
std::vector<znx_i64> la_norm;
for (uint64_t i = 0; i < sa; ++i) {
la_norm.push_back(la.get_copy_zext(i));
}
vec_poly_normalize(base_k, la_norm);
const uint64_t tmp_size = tmp_bytes(mod);
uint8_t* tmp = new uint8_t[tmp_size];
test_normalize(mod, // N
base_k, // base_k
la.data(), sa, a_sl, // res
la.data(), sa, a_sl, // a
tmp);
delete[] tmp;
for (uint64_t i = 0; i < sa; ++i) {
ASSERT_EQ(la.get_copy_zext(i), la_norm[i]) << n << " " << sa << " " << i;
}
}
}
delete_module_info(mod);
}
}
TEST(vec_znx, vec_znx_normalize_inplace) {
test_vec_znx_normalize_inplace(vec_znx_normalize_base2k, vec_znx_normalize_base2k_tmp_bytes);
}
TEST(vec_znx, vec_znx_normalize_inplace_ref) {
test_vec_znx_normalize_inplace(vec_znx_normalize_base2k_ref, vec_znx_normalize_base2k_tmp_bytes_ref);
}

View File

@@ -0,0 +1,121 @@
#include <gtest/gtest.h>
#include "../spqlios/arithmetic/vec_znx_arithmetic_private.h"
#include "testlib/fft64_layouts.h"
#include "testlib/polynomial_vector.h"
static void test_vmp_prepare_contiguous(VMP_PREPARE_CONTIGUOUS_F* prepare_contiguous,
VMP_PREPARE_CONTIGUOUS_TMP_BYTES_F* tmp_bytes) {
// tests when n < 8
for (uint64_t nn : {2, 4}) {
MODULE* module = new_module_info(nn, FFT64);
for (uint64_t nrows : {1, 2, 5}) {
for (uint64_t ncols : {2, 6, 7}) {
znx_vec_i64_layout mat(nn, nrows * ncols, nn);
fft64_vmp_pmat_layout pmat(nn, nrows, ncols);
mat.fill_random(30);
std::vector<uint8_t> tmp_space(tmp_bytes(module, nrows, ncols));
thash hash_before = mat.content_hash();
prepare_contiguous(module, pmat.data, mat.data(), nrows, ncols, tmp_space.data());
ASSERT_EQ(mat.content_hash(), hash_before);
for (uint64_t row = 0; row < nrows; ++row) {
for (uint64_t col = 0; col < ncols; ++col) {
const double* pmatv = (double*)pmat.data + (col * nrows + row) * nn;
reim_fft64vec tmp = simple_fft64(mat.get_copy(row * ncols + col));
const double* tmpv = tmp.data();
for (uint64_t i = 0; i < nn; ++i) {
ASSERT_LE(fabs(pmatv[i] - tmpv[i]), 1e-10);
}
}
}
}
}
delete_module_info(module);
}
// tests when n >= 8
for (uint64_t nn : {8, 32}) {
MODULE* module = new_module_info(nn, FFT64);
uint64_t nblk = nn / 8;
for (uint64_t nrows : {1, 2, 5}) {
for (uint64_t ncols : {2, 6, 7}) {
znx_vec_i64_layout mat(nn, nrows * ncols, nn);
fft64_vmp_pmat_layout pmat(nn, nrows, ncols);
mat.fill_random(30);
std::vector<uint8_t> tmp_space(tmp_bytes(module, nrows, ncols));
thash hash_before = mat.content_hash();
prepare_contiguous(module, pmat.data, mat.data(), nrows, ncols, tmp_space.data());
ASSERT_EQ(mat.content_hash(), hash_before);
for (uint64_t row = 0; row < nrows; ++row) {
for (uint64_t col = 0; col < ncols; ++col) {
reim_fft64vec tmp = simple_fft64(mat.get_copy(row * ncols + col));
for (uint64_t blk = 0; blk < nblk; ++blk) {
reim4_elem expect = tmp.get_blk(blk);
reim4_elem actual = pmat.get(row, col, blk);
ASSERT_LE(infty_dist(actual, expect), 1e-10);
}
}
}
}
}
delete_module_info(module);
}
}
TEST(vec_znx, vmp_prepare_contiguous) {
test_vmp_prepare_contiguous(vmp_prepare_contiguous, vmp_prepare_contiguous_tmp_bytes);
}
TEST(vec_znx, fft64_vmp_prepare_contiguous_ref) {
test_vmp_prepare_contiguous(fft64_vmp_prepare_contiguous_ref, fft64_vmp_prepare_contiguous_tmp_bytes);
}
#ifdef __x86_64__
TEST(vec_znx, fft64_vmp_prepare_contiguous_avx) {
test_vmp_prepare_contiguous(fft64_vmp_prepare_contiguous_avx, fft64_vmp_prepare_contiguous_tmp_bytes);
}
#endif
static void test_vmp_apply(VMP_APPLY_DFT_TO_DFT_F* apply, VMP_APPLY_DFT_TO_DFT_TMP_BYTES_F* tmp_bytes) {
for (uint64_t nn : {2, 4, 8, 64}) {
MODULE* module = new_module_info(nn, FFT64);
for (uint64_t mat_nrows : {1, 4, 7}) {
for (uint64_t mat_ncols : {1, 2, 5}) {
for (uint64_t in_size : {1, 4, 7}) {
for (uint64_t out_size : {1, 2, 5}) {
fft64_vec_znx_dft_layout in(nn, in_size);
fft64_vmp_pmat_layout pmat(nn, mat_nrows, mat_ncols);
fft64_vec_znx_dft_layout out(nn, out_size);
in.fill_random(0);
pmat.fill_random(0);
// naive computation of the product
std::vector<reim_fft64vec> expect(out_size, reim_fft64vec(nn));
for (uint64_t col = 0; col < out_size; ++col) {
reim_fft64vec ex = reim_fft64vec::zero(nn);
for (uint64_t row = 0; row < std::min(mat_nrows, in_size); ++row) {
ex += pmat.get_zext(row, col) * in.get_copy_zext(row);
}
expect[col] = ex;
}
// apply the product
std::vector<uint8_t> tmp(tmp_bytes(module, out_size, in_size, mat_nrows, mat_ncols));
apply(module, out.data, out_size, in.data, in_size, pmat.data, mat_nrows, mat_ncols, tmp.data());
// check that the output is close to the expectation
for (uint64_t col = 0; col < out_size; ++col) {
reim_fft64vec actual = out.get_copy_zext(col);
ASSERT_LE(infty_dist(actual, expect[col]), 1e-10);
}
}
}
}
}
delete_module_info(module);
}
}
TEST(vec_znx, vmp_apply_to_dft) { test_vmp_apply(vmp_apply_dft_to_dft, vmp_apply_dft_to_dft_tmp_bytes); }
TEST(vec_znx, fft64_vmp_apply_dft_to_dft_ref) {
test_vmp_apply(fft64_vmp_apply_dft_to_dft_ref, fft64_vmp_apply_dft_to_dft_tmp_bytes);
}
#ifdef __x86_64__
TEST(vec_znx, fft64_vmp_apply_dft_to_dft_avx) {
test_vmp_apply(fft64_vmp_apply_dft_to_dft_avx, fft64_vmp_apply_dft_to_dft_tmp_bytes);
}
#endif

View File

@@ -0,0 +1,46 @@
#include "gtest/gtest.h"
#include "spqlios/arithmetic/zn_arithmetic_private.h"
#include "testlib/test_commons.h"
template <typename INTTYPE>
static void test_tndbl_approxdecomp( //
void (*approxdec)(const MOD_Z*, const TNDBL_APPROXDECOMP_GADGET*, INTTYPE*, uint64_t, const double*, uint64_t) //
) {
for (const uint64_t nn : {1, 3, 8, 51}) {
MOD_Z* module = new_z_module_info(DEFAULT);
for (const uint64_t ell : {1, 2, 7}) {
for (const uint64_t k : {2, 5}) {
TNDBL_APPROXDECOMP_GADGET* gadget = new_tndbl_approxdecomp_gadget(module, k, ell);
for (const uint64_t res_size : {ell * nn}) {
std::vector<double> in(nn);
std::vector<INTTYPE> out(res_size);
for (double& x : in) x = uniform_f64_bounds(-10, 10);
approxdec(module, gadget, out.data(), res_size, in.data(), nn);
// reconstruct the output
double err_bnd = pow(2., -double(ell * k) - 1);
for (uint64_t j = 0; j < nn; ++j) {
double in_j = in[j];
double out_j = 0;
for (uint64_t i = 0; i < ell; ++i) {
out_j += out[ell * j + i] * pow(2., -double((i + 1) * k));
}
double err = out_j - in_j;
double err_abs = fabs(err - rint(err));
ASSERT_LE(err_abs, err_bnd);
}
}
delete_tndbl_approxdecomp_gadget(gadget);
}
}
delete_z_module_info(module);
}
}
TEST(vec_rnx, i8_tndbl_rnx_approxdecomp) { test_tndbl_approxdecomp(i8_approxdecomp_from_tndbl); }
TEST(vec_rnx, default_i8_tndbl_rnx_approxdecomp) { test_tndbl_approxdecomp(default_i8_approxdecomp_from_tndbl_ref); }
TEST(vec_rnx, i16_tndbl_rnx_approxdecomp) { test_tndbl_approxdecomp(i16_approxdecomp_from_tndbl); }
TEST(vec_rnx, default_i16_tndbl_rnx_approxdecomp) { test_tndbl_approxdecomp(default_i16_approxdecomp_from_tndbl_ref); }
TEST(vec_rnx, i32_tndbl_rnx_approxdecomp) { test_tndbl_approxdecomp(i32_approxdecomp_from_tndbl); }
TEST(vec_rnx, default_i32_tndbl_rnx_approxdecomp) { test_tndbl_approxdecomp(default_i32_approxdecomp_from_tndbl_ref); }

View File

@@ -0,0 +1,104 @@
#include <gtest/gtest.h>
#include <spqlios/arithmetic/zn_arithmetic_private.h>
#include "testlib/test_commons.h"
template <typename SRC_T, typename DST_T>
static void test_conv(void (*conv_f)(const MOD_Z*, DST_T* res, uint64_t res_size, const SRC_T* a, uint64_t a_size),
DST_T (*ideal_conv_f)(SRC_T x), SRC_T (*random_f)()) {
MOD_Z* module = new_z_module_info(DEFAULT);
for (uint64_t a_size : {0, 1, 2, 42}) {
for (uint64_t res_size : {0, 1, 2, 42}) {
for (uint64_t trials = 0; trials < 100; ++trials) {
std::vector<SRC_T> a(a_size);
std::vector<DST_T> res(res_size);
uint64_t msize = std::min(a_size, res_size);
for (SRC_T& x : a) x = random_f();
conv_f(module, res.data(), res_size, a.data(), a_size);
for (uint64_t i = 0; i < msize; ++i) {
DST_T expect = ideal_conv_f(a[i]);
DST_T actual = res[i];
ASSERT_EQ(expect, actual);
}
for (uint64_t i = msize; i < res_size; ++i) {
DST_T expect = 0;
DST_T actual = res[i];
ASSERT_EQ(expect, actual);
}
}
}
}
delete_z_module_info(module);
}
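// ideal conversion double -> torus32: reduce modulo 1 to the centered interval, scale by
// 2^32 and round to the nearest integer, kept as a 32-bit two's-complement value.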
static int32_t ideal_dbl_to_tn32(double a) {
double _2p32 = INT64_C(1) << 32;
double a_mod_1 = a - rint(a);
int64_t t = rint(a_mod_1 * _2p32);
return int32_t(t);
}
static double random_f64_10() { return uniform_f64_bounds(-10, 10); }
static void test_dbl_to_tn32(DBL_TO_TN32_F dbl_to_tn32_f) {
test_conv(dbl_to_tn32_f, ideal_dbl_to_tn32, random_f64_10);
}
TEST(zn_arithmetic, dbl_to_tn32) { test_dbl_to_tn32(dbl_to_tn32); }
TEST(zn_arithmetic, dbl_to_tn32_ref) { test_dbl_to_tn32(dbl_to_tn32_ref); }
static double ideal_tn32_to_dbl(int32_t a) {
const double _2p32 = INT64_C(1) << 32;
return double(a) / _2p32;
}
static int32_t random_t32() { return uniform_i64_bits(32); }
static void test_tn32_to_dbl(TN32_TO_DBL_F tn32_to_dbl_f) { test_conv(tn32_to_dbl_f, ideal_tn32_to_dbl, random_t32); }
TEST(zn_arithmetic, tn32_to_dbl) { test_tn32_to_dbl(tn32_to_dbl); }
TEST(zn_arithmetic, tn32_to_dbl_ref) { test_tn32_to_dbl(tn32_to_dbl_ref); }
static int32_t ideal_dbl_round_to_i32(double a) { return int32_t(rint(a)); }
static double random_dbl_explaw_18() { return uniform_f64_bounds(-1., 1.) * pow(2., uniform_u64_bits(6) % 19); }
static void test_dbl_round_to_i32(DBL_ROUND_TO_I32_F dbl_round_to_i32_f) {
test_conv(dbl_round_to_i32_f, ideal_dbl_round_to_i32, random_dbl_explaw_18);
}
TEST(zn_arithmetic, dbl_round_to_i32) { test_dbl_round_to_i32(dbl_round_to_i32); }
TEST(zn_arithmetic, dbl_round_to_i32_ref) { test_dbl_round_to_i32(dbl_round_to_i32_ref); }
static double ideal_i32_to_dbl(int32_t a) { return double(a); }
static int32_t random_i32_explaw_18() { return uniform_i64_bits(uniform_u64_bits(6) % 19); }
static void test_i32_to_dbl(I32_TO_DBL_F i32_to_dbl_f) {
test_conv(i32_to_dbl_f, ideal_i32_to_dbl, random_i32_explaw_18);
}
TEST(zn_arithmetic, i32_to_dbl) { test_i32_to_dbl(i32_to_dbl); }
TEST(zn_arithmetic, i32_to_dbl_ref) { test_i32_to_dbl(i32_to_dbl_ref); }
static int64_t ideal_dbl_round_to_i64(double a) { return rint(a); }
static double random_dbl_explaw_50() { return uniform_f64_bounds(-1., 1.) * pow(2., uniform_u64_bits(7) % 51); }
static void test_dbl_round_to_i64(DBL_ROUND_TO_I64_F dbl_round_to_i64_f) {
test_conv(dbl_round_to_i64_f, ideal_dbl_round_to_i64, random_dbl_explaw_50);
}
TEST(zn_arithmetic, dbl_round_to_i64) { test_dbl_round_to_i64(dbl_round_to_i64); }
TEST(zn_arithmetic, dbl_round_to_i64_ref) { test_dbl_round_to_i64(dbl_round_to_i64_ref); }
static double ideal_i64_to_dbl(int64_t a) { return double(a); }
static int64_t random_i64_explaw_50() { return uniform_i64_bits(uniform_u64_bits(7) % 51); }
static void test_i64_to_dbl(I64_TO_DBL_F i64_to_dbl_f) {
test_conv(i64_to_dbl_f, ideal_i64_to_dbl, random_i64_explaw_50);
}
TEST(zn_arithmetic, i64_to_dbl) { test_i64_to_dbl(i64_to_dbl); }
TEST(zn_arithmetic, i64_to_dbl_ref) { test_i64_to_dbl(i64_to_dbl_ref); }

View File

@@ -0,0 +1,67 @@
#include "gtest/gtest.h"
#include "spqlios/arithmetic/zn_arithmetic_private.h"
#include "testlib/zn_layouts.h"
static void test_zn_vmp_prepare(ZN32_VMP_PREPARE_CONTIGUOUS_F prep) {
MOD_Z* module = new_z_module_info(DEFAULT);
for (uint64_t nrows : {1, 2, 5, 15}) {
for (uint64_t ncols : {1, 2, 32, 42, 67}) {
std::vector<int32_t> src(nrows * ncols);
zn32_pmat_layout out(nrows, ncols);
for (int32_t& x : src) x = uniform_i64_bits(32);
prep(module, out.data, src.data(), nrows, ncols);
for (uint64_t i = 0; i < nrows; ++i) {
for (uint64_t j = 0; j < ncols; ++j) {
int32_t in = src[i * ncols + j];
int32_t actual = out.get(i, j);
ASSERT_EQ(actual, in);
}
}
}
}
delete_z_module_info(module);
}
TEST(zn, zn32_vmp_prepare_contiguous) { test_zn_vmp_prepare(zn32_vmp_prepare_contiguous); }
TEST(zn, default_zn32_vmp_prepare_contiguous_ref) { test_zn_vmp_prepare(default_zn32_vmp_prepare_contiguous_ref); }
template <typename INTTYPE>
static void test_zn_vmp_apply(void (*apply)(const MOD_Z*, int32_t*, uint64_t, const INTTYPE*, uint64_t,
const ZN32_VMP_PMAT*, uint64_t, uint64_t)) {
MOD_Z* module = new_z_module_info(DEFAULT);
for (uint64_t nrows : {1, 2, 5, 15}) {
for (uint64_t ncols : {1, 2, 32, 42, 67}) {
for (uint64_t a_size : {1, 2, 5, 15}) {
for (uint64_t res_size : {1, 2, 32, 42, 67}) {
std::vector<INTTYPE> a(a_size);
zn32_pmat_layout out(nrows, ncols);
std::vector<int32_t> res(res_size);
for (INTTYPE& x : a) x = uniform_i64_bits(32);
out.fill_random();
std::vector<int32_t> expect = vmp_product(a.data(), a_size, res_size, out);
apply(module, res.data(), res_size, a.data(), a_size, out.data, nrows, ncols);
for (uint64_t i = 0; i < res_size; ++i) {
int32_t exp = expect[i];
int32_t actual = res[i];
ASSERT_EQ(actual, exp);
}
}
}
}
}
delete_z_module_info(module);
}
TEST(zn, zn32_vmp_apply_i32) { test_zn_vmp_apply(zn32_vmp_apply_i32); }
TEST(zn, zn32_vmp_apply_i16) { test_zn_vmp_apply(zn32_vmp_apply_i16); }
TEST(zn, zn32_vmp_apply_i8) { test_zn_vmp_apply(zn32_vmp_apply_i8); }
TEST(zn, default_zn32_vmp_apply_i32_ref) { test_zn_vmp_apply(default_zn32_vmp_apply_i32_ref); }
TEST(zn, default_zn32_vmp_apply_i16_ref) { test_zn_vmp_apply(default_zn32_vmp_apply_i16_ref); }
TEST(zn, default_zn32_vmp_apply_i8_ref) { test_zn_vmp_apply(default_zn32_vmp_apply_i8_ref); }
#ifdef __x86_64__
TEST(zn, default_zn32_vmp_apply_i32_avx) { test_zn_vmp_apply(default_zn32_vmp_apply_i32_avx); }
TEST(zn, default_zn32_vmp_apply_i16_avx) { test_zn_vmp_apply(default_zn32_vmp_apply_i16_avx); }
TEST(zn, default_zn32_vmp_apply_i8_avx) { test_zn_vmp_apply(default_zn32_vmp_apply_i8_avx); }
#endif

View File

@@ -0,0 +1,26 @@
#include <gtest/gtest.h>
#include "../spqlios/arithmetic/vec_znx_arithmetic_private.h"
#include "testlib/negacyclic_polynomial.h"
static void test_znx_small_single_product(ZNX_SMALL_SINGLE_PRODUCT_F product,
ZNX_SMALL_SINGLE_PRODUCT_TMP_BYTES_F product_tmp_bytes) {
for (const uint64_t nn : {2, 4, 8, 64}) {
MODULE* module = new_module_info(nn, FFT64);
znx_i64 a = znx_i64::random_log2bound(nn, 20);
znx_i64 b = znx_i64::random_log2bound(nn, 20);
znx_i64 expect = naive_product(a, b);
znx_i64 actual(nn);
std::vector<uint8_t> tmp(product_tmp_bytes(module));
product(module, actual.data(), a.data(), b.data(), tmp.data());
ASSERT_EQ(actual, expect) << actual.get_coeff(0) << " vs. " << expect.get_coeff(0);
delete_module_info(module);
}
}
TEST(znx_small, fft64_znx_small_single_product) {
test_znx_small_single_product(fft64_znx_small_single_product, fft64_znx_small_single_product_tmp_bytes);
}
TEST(znx_small, znx_small_single_product) {
test_znx_small_single_product(znx_small_single_product, znx_small_single_product_tmp_bytes);
}

View File

@@ -0,0 +1,168 @@
#include "fft64_dft.h"
#include <cstring>
#include "../../spqlios/reim/reim_fft.h"
#include "../../spqlios/reim/reim_fft_internal.h"
reim_fft64vec::reim_fft64vec(uint64_t n) : v(n, 0) {}
reim4_elem reim_fft64vec::get_blk(uint64_t blk) const {
return reim_view(v.size() / 2, (double*)v.data()).get_blk(blk);
}
double* reim_fft64vec::data() { return v.data(); }
const double* reim_fft64vec::data() const { return v.data(); }
uint64_t reim_fft64vec::nn() const { return v.size(); }
reim_fft64vec::reim_fft64vec(uint64_t n, const double* data) : v(data, data + n) {}
void reim_fft64vec::save_as(double* dest) const { memcpy(dest, v.data(), nn() * sizeof(double)); }
reim_fft64vec reim_fft64vec::zero(uint64_t n) { return reim_fft64vec(n); }
void reim_fft64vec::set_blk(uint64_t blk, const reim4_elem& value) {
reim_view(v.size() / 2, (double*)v.data()).set_blk(blk, value);
}
reim_fft64vec reim_fft64vec::dft_random(uint64_t n, uint64_t log2bound) {
return simple_fft64(znx_i64::random_log2bound(n, log2bound));
}
reim_fft64vec reim_fft64vec::random(uint64_t n, double log2bound) {
double bound = pow(2., log2bound);
reim_fft64vec res(n);
for (uint64_t i = 0; i < n; ++i) {
res.v[i] = uniform_f64_bounds(-bound, bound);
}
return res;
}
reim_fft64vec operator+(const reim_fft64vec& a, const reim_fft64vec& b) {
uint64_t nn = a.nn();
REQUIRE_DRAMATICALLY(b.nn() == a.nn(), "ring dimension mismatch");
reim_fft64vec res(nn);
double* rv = res.data();
const double* av = a.data();
const double* bv = b.data();
for (uint64_t i = 0; i < nn; ++i) {
rv[i] = av[i] + bv[i];
}
return res;
}
reim_fft64vec operator-(const reim_fft64vec& a, const reim_fft64vec& b) {
uint64_t nn = a.nn();
REQUIRE_DRAMATICALLY(b.nn() == a.nn(), "ring dimension mismatch");
reim_fft64vec res(nn);
double* rv = res.data();
const double* av = a.data();
const double* bv = b.data();
for (uint64_t i = 0; i < nn; ++i) {
rv[i] = av[i] - bv[i];
}
return res;
}
reim_fft64vec operator*(const reim_fft64vec& a, const reim_fft64vec& b) {
uint64_t nn = a.nn();
REQUIRE_DRAMATICALLY(b.nn() == a.nn(), "ring dimension mismatch");
REQUIRE_DRAMATICALLY(nn >= 2, "test not defined for nn=1");
uint64_t m = nn / 2;
reim_fft64vec res(nn);
double* rv = res.data();
const double* av = a.data();
const double* bv = b.data();
for (uint64_t i = 0; i < m; ++i) {
rv[i] = av[i] * bv[i] - av[m + i] * bv[m + i];
rv[m + i] = av[i] * bv[m + i] + av[m + i] * bv[i];
}
return res;
}
reim_fft64vec& operator+=(reim_fft64vec& a, const reim_fft64vec& b) {
uint64_t nn = a.nn();
REQUIRE_DRAMATICALLY(b.nn() == a.nn(), "ring dimension mismatch");
double* av = a.data();
const double* bv = b.data();
for (uint64_t i = 0; i < nn; ++i) {
av[i] = av[i] + bv[i];
}
return a;
}
reim_fft64vec& operator-=(reim_fft64vec& a, const reim_fft64vec& b) {
uint64_t nn = a.nn();
REQUIRE_DRAMATICALLY(b.nn() == a.nn(), "ring dimension mismatch");
double* av = a.data();
const double* bv = b.data();
for (uint64_t i = 0; i < nn; ++i) {
av[i] = av[i] - bv[i];
}
return a;
}
reim_fft64vec simple_fft64(const znx_i64& polynomial) {
const uint64_t nn = polynomial.nn();
const uint64_t m = nn / 2;
reim_fft64vec res(nn);
double* dat = res.data();
for (uint64_t i = 0; i < nn; ++i) dat[i] = polynomial.get_coeff(i);
reim_fft_simple(m, dat);
return res;
}
znx_i64 simple_rint_ifft64(const reim_fft64vec& fftvec) {
const uint64_t nn = fftvec.nn();
const uint64_t m = nn / 2;
std::vector<double> vv(fftvec.data(), fftvec.data() + nn);
double* v = vv.data();
reim_ifft_simple(m, v);
znx_i64 res(nn);
for (uint64_t i = 0; i < nn; ++i) {
res.set_coeff(i, rint(v[i] / m));
}
return res;
}
rnx_f64 naive_ifft64(const reim_fft64vec& fftvec) {
const uint64_t nn = fftvec.nn();
const uint64_t m = nn / 2;
std::vector<double> vv(fftvec.data(), fftvec.data() + nn);
double* v = vv.data();
reim_ifft_simple(m, v);
rnx_f64 res(nn);
for (uint64_t i = 0; i < nn; ++i) {
res.set_coeff(i, v[i] / m);
}
return res;
}
double infty_dist(const reim_fft64vec& a, const reim_fft64vec& b) {
const uint64_t n = a.nn();
REQUIRE_DRAMATICALLY(b.nn() == a.nn(), "dimensions mismatch");
const double* da = a.data();
const double* db = b.data();
double d = 0;
for (uint64_t i = 0; i < n; ++i) {
double di = fabs(da[i] - db[i]);
if (di > d) d = di;
}
return d;
}
reim_fft64vec simple_fft64(const rnx_f64& polynomial) {
const uint64_t nn = polynomial.nn();
const uint64_t m = nn / 2;
reim_fft64vec res(nn);
double* dat = res.data();
for (uint64_t i = 0; i < nn; ++i) dat[i] = polynomial.get_coeff(i);
reim_fft_simple(m, dat);
return res;
}
reim_fft64vec operator*(double coeff, const reim_fft64vec& v) {
const uint64_t nn = v.nn();
reim_fft64vec res(nn);
double* rr = res.data();
const double* vv = v.data();
for (uint64_t i = 0; i < nn; ++i) rr[i] = coeff * vv[i];
return res;
}
rnx_f64 simple_ifft64(const reim_fft64vec& v) {
const uint64_t nn = v.nn();
const uint64_t m = nn / 2;
rnx_f64 res(nn);
double* dat = res.data();
memcpy(dat, v.data(), nn * sizeof(double));
reim_ifft_simple(m, dat);
return res;
}

View File

@@ -0,0 +1,43 @@
#ifndef SPQLIOS_FFT64_DFT_H
#define SPQLIOS_FFT64_DFT_H
#include "negacyclic_polynomial.h"
#include "reim4_elem.h"
class reim_fft64vec {
std::vector<double> v;
public:
reim_fft64vec() = default;
explicit reim_fft64vec(uint64_t n);
reim_fft64vec(uint64_t n, const double* data);
uint64_t nn() const;
static reim_fft64vec zero(uint64_t n);
/** random complex coefficients (unstructured) */
static reim_fft64vec random(uint64_t n, double log2bound);
/** random fft of a small int polynomial */
static reim_fft64vec dft_random(uint64_t n, uint64_t log2bound);
double* data();
const double* data() const;
void save_as(double* dest) const;
reim4_elem get_blk(uint64_t blk) const;
void set_blk(uint64_t blk, const reim4_elem& value);
};
reim_fft64vec operator+(const reim_fft64vec& a, const reim_fft64vec& b);
reim_fft64vec operator-(const reim_fft64vec& a, const reim_fft64vec& b);
reim_fft64vec operator*(const reim_fft64vec& a, const reim_fft64vec& b);
reim_fft64vec operator*(double coeff, const reim_fft64vec& v);
reim_fft64vec& operator+=(reim_fft64vec& a, const reim_fft64vec& b);
reim_fft64vec& operator-=(reim_fft64vec& a, const reim_fft64vec& b);
/** infty distance */
double infty_dist(const reim_fft64vec& a, const reim_fft64vec& b);
reim_fft64vec simple_fft64(const znx_i64& polynomial);
znx_i64 simple_rint_ifft64(const reim_fft64vec& fftvec);
rnx_f64 naive_ifft64(const reim_fft64vec& fftvec);
reim_fft64vec simple_fft64(const rnx_f64& polynomial);
rnx_f64 simple_ifft64(const reim_fft64vec& v);
#endif // SPQLIOS_FFT64_DFT_H

View File

@@ -0,0 +1,238 @@
#include "fft64_layouts.h"
#ifdef VALGRIND_MEM_TESTS
#include "valgrind/memcheck.h"
#endif
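// rounds the requested size up to a multiple of 64 bytes before allocating; under valgrind,
// the padding past the requested size is marked NOACCESS so that overreads by the kernels
// under test are reported.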
void* alloc64(uint64_t size) {
static uint64_t _msk64 = -64;
if (size == 0) return nullptr;
uint64_t rsize = (size + 63) & _msk64;
uint8_t* reps = (uint8_t*)spqlios_alloc(rsize);
REQUIRE_DRAMATICALLY(reps != 0, "Out of memory");
#ifdef VALGRIND_MEM_TESTS
VALGRIND_MAKE_MEM_NOACCESS(reps + size, rsize - size);
#endif
return reps;
}
fft64_vec_znx_dft_layout::fft64_vec_znx_dft_layout(uint64_t n, uint64_t size)
: nn(n), //
size(size), //
data((VEC_ZNX_DFT*)alloc64(n * size * 8)), //
view(n / 2, size, (double*)data) {}
fft64_vec_znx_dft_layout::~fft64_vec_znx_dft_layout() { spqlios_free(data); }
double* fft64_vec_znx_dft_layout::get_addr(uint64_t idx) {
REQUIRE_DRAMATICALLY(idx < size, "index overflow " << idx << " / " << size);
return ((double*)data) + idx * nn;
}
const double* fft64_vec_znx_dft_layout::get_addr(uint64_t idx) const {
REQUIRE_DRAMATICALLY(idx < size, "index overflow " << idx << " / " << size);
return ((double*)data) + idx * nn;
}
reim_fft64vec fft64_vec_znx_dft_layout::get_copy_zext(uint64_t idx) const {
if (idx < size) {
return reim_fft64vec(nn, get_addr(idx));
} else {
return reim_fft64vec::zero(nn);
}
}
void fft64_vec_znx_dft_layout::fill_dft_random_log2bound(uint64_t bits) {
for (uint64_t i = 0; i < size; ++i) {
set(i, simple_fft64(znx_i64::random_log2bound(nn, bits)));
}
}
void fft64_vec_znx_dft_layout::set(uint64_t idx, const reim_fft64vec& value) {
REQUIRE_DRAMATICALLY(value.nn() == nn, "ring dimension mismatch");
value.save_as(get_addr(idx));
}
thash fft64_vec_znx_dft_layout::content_hash() const { return test_hash(data, size * nn * sizeof(double)); }
reim4_elem fft64_vec_znx_dft_layout::get(uint64_t idx, uint64_t blk) const {
REQUIRE_DRAMATICALLY(idx < size, "index overflow: " << idx << " / " << size);
REQUIRE_DRAMATICALLY(blk < nn / 8, "blk overflow: " << blk << " / " << nn / 8);
double* reim = ((double*)data) + idx * nn;
return reim4_elem(reim + blk * 4, reim + nn / 2 + blk * 4);
}
reim4_elem fft64_vec_znx_dft_layout::get_zext(uint64_t idx, uint64_t blk) const {
REQUIRE_DRAMATICALLY(blk < nn / 8, "blk overflow: " << blk << " / " << nn / 8);
if (idx < size) {
return get(idx, blk);
} else {
return reim4_elem::zero();
}
}
void fft64_vec_znx_dft_layout::set(uint64_t idx, uint64_t blk, const reim4_elem& value) {
REQUIRE_DRAMATICALLY(idx < size, "index overflow: " << idx << " / " << size);
REQUIRE_DRAMATICALLY(blk < nn / 8, "blk overflow: " << blk << " / " << nn / 8);
double* reim = ((double*)data) + idx * nn;
value.save_re_im(reim + blk * 4, reim + nn / 2 + blk * 4);
}
void fft64_vec_znx_dft_layout::fill_random(double log2bound) {
for (uint64_t i = 0; i < size; ++i) {
set(i, reim_fft64vec::random(nn, log2bound));
}
}
void fft64_vec_znx_dft_layout::fill_dft_random(uint64_t log2bound) {
for (uint64_t i = 0; i < size; ++i) {
set(i, reim_fft64vec::dft_random(nn, log2bound));
}
}
fft64_vec_znx_big_layout::fft64_vec_znx_big_layout(uint64_t n, uint64_t size)
: nn(n), //
size(size), //
data((VEC_ZNX_BIG*)alloc64(n * size * 8)) {}
znx_i64 fft64_vec_znx_big_layout::get_copy(uint64_t index) const {
REQUIRE_DRAMATICALLY(index < size, "index overflow: " << index << " / " << size);
return znx_i64(nn, ((int64_t*)data) + index * nn);
}
znx_i64 fft64_vec_znx_big_layout::get_copy_zext(uint64_t index) const {
if (index < size) {
return znx_i64(nn, ((int64_t*)data) + index * nn);
} else {
return znx_i64::zero(nn);
}
}
void fft64_vec_znx_big_layout::set(uint64_t index, const znx_i64& value) {
REQUIRE_DRAMATICALLY(index < size, "index overflow: " << index << " / " << size);
value.save_as(((int64_t*)data) + index * nn);
}
void fft64_vec_znx_big_layout::fill_random() {
for (uint64_t i = 0; i < size; ++i) {
set(i, znx_i64::random_log2bound(nn, 1));
}
}
fft64_vec_znx_big_layout::~fft64_vec_znx_big_layout() { spqlios_free(data); }
fft64_vmp_pmat_layout::fft64_vmp_pmat_layout(uint64_t n, uint64_t nrows, uint64_t ncols)
: nn(n),
nrows(nrows),
ncols(ncols), //
data((VMP_PMAT*)alloc64(nrows * ncols * nn * 8)) {}
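// Addressing below follows the layout this test assumes for VMP_PMAT when nn >= 8: the
// reim4 block index is the major dimension, columns are grouped in pairs within a block,
// and the two entries of a pair are interleaved row by row; a trailing odd column is stored
// alone. For nn < 8 the matrix is plain column-major (see the get_zext/set overloads below).
// This documents the test-side assumption, not an authoritative spec of the library layout.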
double* fft64_vmp_pmat_layout::get_addr(uint64_t row, uint64_t col, uint64_t blk) const {
REQUIRE_DRAMATICALLY(row < nrows, "row overflow: " << row << " / " << nrows);
REQUIRE_DRAMATICALLY(col < ncols, "col overflow: " << col << " / " << ncols);
REQUIRE_DRAMATICALLY(blk < nn / 8, "block overflow: " << blk << " / " << (nn / 8));
double* d = (double*)data;
if (col == (ncols - 1) && (ncols % 2 == 1)) {
// special case: last column when the number of columns is odd
return d + blk * nrows * ncols * 8 // major: blk
+ col * nrows * 8 // col == ncols-1
+ row * 8;
} else {
// general case: columns go by pair
return d + blk * nrows * ncols * 8 // major: blk
+ (col / 2) * (2 * nrows) * 8 // second: col pair index
+ row * 2 * 8 // third: row index
+ (col % 2) * 8; // minor: col in colpair
}
}
reim4_elem fft64_vmp_pmat_layout::get(uint64_t row, uint64_t col, uint64_t blk) const {
return reim4_elem(get_addr(row, col, blk));
}
reim4_elem fft64_vmp_pmat_layout::get_zext(uint64_t row, uint64_t col, uint64_t blk) const {
REQUIRE_DRAMATICALLY(blk < nn / 8, "block overflow: " << blk << " / " << (nn / 8));
if (row < nrows && col < ncols) {
return reim4_elem(get_addr(row, col, blk));
} else {
return reim4_elem::zero();
}
}
void fft64_vmp_pmat_layout::set(uint64_t row, uint64_t col, uint64_t blk, const reim4_elem& value) const {
value.save_as(get_addr(row, col, blk));
}
fft64_vmp_pmat_layout::~fft64_vmp_pmat_layout() { spqlios_free(data); }
reim_fft64vec fft64_vmp_pmat_layout::get_zext(uint64_t row, uint64_t col) const {
if (row >= nrows || col >= ncols) {
return reim_fft64vec::zero(nn);
}
if (nn < 8) {
// the pmat is just col major
double* addr = (double*)data + (row + col * nrows) * nn;
return reim_fft64vec(nn, addr);
}
// otherwise, reconstruct it block by block
reim_fft64vec res(nn);
for (uint64_t blk = 0; blk < nn / 8; ++blk) {
reim4_elem v = get(row, col, blk);
res.set_blk(blk, v);
}
return res;
}
void fft64_vmp_pmat_layout::set(uint64_t row, uint64_t col, const reim_fft64vec& value) {
REQUIRE_DRAMATICALLY(row < nrows, "row overflow: " << row << " / " << nrows);
REQUIRE_DRAMATICALLY(col < ncols, "row overflow: " << col << " / " << ncols);
if (nn < 8) {
// the pmat is just col major
double* addr = (double*)data + (row + col * nrows) * nn;
value.save_as(addr);
return;
}
// otherwise, store it block by block
for (uint64_t blk = 0; blk < nn / 8; ++blk) {
reim4_elem v = value.get_blk(blk);
set(row, col, blk, v);
}
}
void fft64_vmp_pmat_layout::fill_random(double log2bound) {
for (uint64_t row = 0; row < nrows; ++row) {
for (uint64_t col = 0; col < ncols; ++col) {
set(row, col, reim_fft64vec::random(nn, log2bound));
}
}
}
void fft64_vmp_pmat_layout::fill_dft_random(uint64_t log2bound) {
for (uint64_t row = 0; row < nrows; ++row) {
for (uint64_t col = 0; col < ncols; ++col) {
set(row, col, reim_fft64vec::dft_random(nn, log2bound));
}
}
}
fft64_svp_ppol_layout::fft64_svp_ppol_layout(uint64_t n)
: nn(n), //
data((SVP_PPOL*)alloc64(nn * 8)) {}
reim_fft64vec fft64_svp_ppol_layout::get_copy() const { return reim_fft64vec(nn, (double*)data); }
void fft64_svp_ppol_layout::set(const reim_fft64vec& value) { value.save_as((double*)data); }
void fft64_svp_ppol_layout::fill_dft_random(uint64_t log2bound) { set(reim_fft64vec::dft_random(nn, log2bound)); }
void fft64_svp_ppol_layout::fill_random(double log2bound) { set(reim_fft64vec::random(nn, log2bound)); }
fft64_svp_ppol_layout::~fft64_svp_ppol_layout() { spqlios_free(data); }
thash fft64_svp_ppol_layout::content_hash() const { return test_hash(data, nn * sizeof(double)); }
fft64_cnv_left_layout::fft64_cnv_left_layout(uint64_t n, uint64_t size)
: nn(n), //
size(size),
data((CNV_PVEC_L*)alloc64(size * nn * 8)) {}
reim4_elem fft64_cnv_left_layout::get(uint64_t idx, uint64_t blk) {
REQUIRE_DRAMATICALLY(idx < size, "idx overflow: " << idx << " / " << size);
REQUIRE_DRAMATICALLY(blk < nn / 8, "block overflow: " << blk << " / " << (nn / 8));
return reim4_elem(((double*)data) + blk * size + idx);
}
fft64_cnv_left_layout::~fft64_cnv_left_layout() { spqlios_free(data); }
fft64_cnv_right_layout::fft64_cnv_right_layout(uint64_t n, uint64_t size)
: nn(n), //
size(size),
data((CNV_PVEC_R*)alloc64(size * nn * 8)) {}
reim4_elem fft64_cnv_right_layout::get(uint64_t idx, uint64_t blk) {
REQUIRE_DRAMATICALLY(idx < size, "idx overflow: " << idx << " / " << size);
REQUIRE_DRAMATICALLY(blk < nn / 8, "block overflow: " << blk << " / " << (nn / 8));
return reim4_elem(((double*)data) + blk * size + idx);
}
fft64_cnv_right_layout::~fft64_cnv_right_layout() { spqlios_free(data); }

View File

@@ -0,0 +1,109 @@
#ifndef SPQLIOS_FFT64_LAYOUTS_H
#define SPQLIOS_FFT64_LAYOUTS_H
#include "../../spqlios/arithmetic/vec_znx_arithmetic.h"
#include "fft64_dft.h"
#include "negacyclic_polynomial.h"
#include "reim4_elem.h"
/** @brief test layout for the VEC_ZNX_DFT */
struct fft64_vec_znx_dft_layout {
public:
const uint64_t nn;
const uint64_t size;
VEC_ZNX_DFT* const data;
reim_vector_view view;
/** @brief fill with random double values (unstructured) */
void fill_random(double log2bound);
/** @brief fill with random ffts of small int polynomials */
void fill_dft_random(uint64_t log2bound);
reim4_elem get(uint64_t idx, uint64_t blk) const;
reim4_elem get_zext(uint64_t idx, uint64_t blk) const;
void set(uint64_t idx, uint64_t blk, const reim4_elem& value);
fft64_vec_znx_dft_layout(uint64_t n, uint64_t size);
void fill_random_log2bound(uint64_t bits);
void fill_dft_random_log2bound(uint64_t bits);
double* get_addr(uint64_t idx);
const double* get_addr(uint64_t idx) const;
reim_fft64vec get_copy_zext(uint64_t idx) const;
void set(uint64_t idx, const reim_fft64vec& value);
thash content_hash() const;
~fft64_vec_znx_dft_layout();
};
/** @brief test layout for the VEC_ZNX_BIG */
class fft64_vec_znx_big_layout {
public:
const uint64_t nn;
const uint64_t size;
VEC_ZNX_BIG* const data;
fft64_vec_znx_big_layout(uint64_t n, uint64_t size);
void fill_random();
znx_i64 get_copy(uint64_t index) const;
znx_i64 get_copy_zext(uint64_t index) const;
void set(uint64_t index, const znx_i64& value);
thash content_hash() const;
~fft64_vec_znx_big_layout();
};
/** @brief test layout for the VMP_PMAT */
class fft64_vmp_pmat_layout {
public:
const uint64_t nn;
const uint64_t nrows;
const uint64_t ncols;
VMP_PMAT* const data;
fft64_vmp_pmat_layout(uint64_t n, uint64_t nrows, uint64_t ncols);
double* get_addr(uint64_t row, uint64_t col, uint64_t blk) const;
reim4_elem get(uint64_t row, uint64_t col, uint64_t blk) const;
thash content_hash() const;
reim4_elem get_zext(uint64_t row, uint64_t col, uint64_t blk) const;
reim_fft64vec get_zext(uint64_t row, uint64_t col) const;
void set(uint64_t row, uint64_t col, uint64_t blk, const reim4_elem& v) const;
void set(uint64_t row, uint64_t col, const reim_fft64vec& value);
/** @brief fill with random double values (unstructured) */
void fill_random(double log2bound);
/** @brief fill with random ffts of small int polynomials */
void fill_dft_random(uint64_t log2bound);
~fft64_vmp_pmat_layout();
};
/** @brief test layout for the SVP_PPOL */
class fft64_svp_ppol_layout {
public:
const uint64_t nn;
SVP_PPOL* const data;
fft64_svp_ppol_layout(uint64_t n);
thash content_hash() const;
reim_fft64vec get_copy() const;
void set(const reim_fft64vec&);
/** @brief fill with random double values (unstructured) */
void fill_random(double log2bound);
/** @brief fill with random ffts of small int polynomials */
void fill_dft_random(uint64_t log2bound);
~fft64_svp_ppol_layout();
};
/** @brief test layout for the CNV_PVEC_L */
class fft64_cnv_left_layout {
const uint64_t nn;
const uint64_t size;
CNV_PVEC_L* const data;
fft64_cnv_left_layout(uint64_t n, uint64_t size);
reim4_elem get(uint64_t idx, uint64_t blk);
thash content_hash() const;
~fft64_cnv_left_layout();
};
/** @brief test layout for the CNV_PVEC_R */
class fft64_cnv_right_layout {
const uint64_t nn;
const uint64_t size;
CNV_PVEC_R* const data;
fft64_cnv_right_layout(uint64_t n, uint64_t size);
reim4_elem get(uint64_t idx, uint64_t blk);
thash content_hash() const;
~fft64_cnv_right_layout();
};
#endif // SPQLIOS_FFT64_LAYOUTS_H

View File

@@ -0,0 +1,229 @@
#include "mod_q120.h"
#include <cstdint>
#include <random>
int64_t centermod(int64_t v, int64_t q) {
int64_t t = v % q;
if (t >= (q + 1) / 2) return t - q;
if (t < -q / 2) return t + q;
return t;
}
int64_t centermod(uint64_t v, int64_t q) {
int64_t t = int64_t(v % uint64_t(q));
if (t >= q / 2) return t - q;
return t;
}
mod_q120::mod_q120() {
for (uint64_t i = 0; i < 4; ++i) {
a[i] = 0;
}
}
mod_q120::mod_q120(int64_t a0, int64_t a1, int64_t a2, int64_t a3) {
a[0] = centermod(a0, Qi[0]);
a[1] = centermod(a1, Qi[1]);
a[2] = centermod(a2, Qi[2]);
a[3] = centermod(a3, Qi[3]);
}
mod_q120 operator+(const mod_q120& x, const mod_q120& y) {
mod_q120 r;
for (uint64_t i = 0; i < 4; ++i) {
r.a[i] = centermod(x.a[i] + y.a[i], mod_q120::Qi[i]);
}
return r;
}
mod_q120 operator-(const mod_q120& x, const mod_q120& y) {
mod_q120 r;
for (uint64_t i = 0; i < 4; ++i) {
r.a[i] = centermod(x.a[i] - y.a[i], mod_q120::Qi[i]);
}
return r;
}
mod_q120 operator*(const mod_q120& x, const mod_q120& y) {
mod_q120 r;
for (uint64_t i = 0; i < 4; ++i) {
r.a[i] = centermod(x.a[i] * y.a[i], mod_q120::Qi[i]);
}
return r;
}
mod_q120& operator+=(mod_q120& x, const mod_q120& y) {
for (uint64_t i = 0; i < 4; ++i) {
x.a[i] = centermod(x.a[i] + y.a[i], mod_q120::Qi[i]);
}
return x;
}
mod_q120& operator-=(mod_q120& x, const mod_q120& y) {
for (uint64_t i = 0; i < 4; ++i) {
x.a[i] = centermod(x.a[i] - y.a[i], mod_q120::Qi[i]);
}
return x;
}
mod_q120& operator*=(mod_q120& x, const mod_q120& y) {
for (uint64_t i = 0; i < 4; ++i) {
x.a[i] = centermod(x.a[i] * y.a[i], mod_q120::Qi[i]);
}
return x;
}
int64_t modq_pow(int64_t x, int32_t k, int64_t q) {
k = (k % (q - 1) + q - 1) % (q - 1);
int64_t res = 1;
int64_t x_pow = centermod(x, q);
while (k != 0) {
if (k & 1) res = centermod(res * x_pow, q);
x_pow = centermod(x_pow * x_pow, q);
k >>= 1;
}
return res;
}
mod_q120 pow(const mod_q120& x, int32_t k) {
const int64_t r0 = modq_pow(x.a[0], k, x.Qi[0]);
const int64_t r1 = modq_pow(x.a[1], k, x.Qi[1]);
const int64_t r2 = modq_pow(x.a[2], k, x.Qi[2]);
const int64_t r3 = modq_pow(x.a[3], k, x.Qi[3]);
return mod_q120{r0, r1, r2, r3};
}
static int64_t half_modq(int64_t x, int64_t q) {
// q must be odd in this function
if (x % 2 == 0) return x / 2;
return centermod((x + q) / 2, q);
}
mod_q120 half(const mod_q120& x) {
const int64_t r0 = half_modq(x.a[0], x.Qi[0]);
const int64_t r1 = half_modq(x.a[1], x.Qi[1]);
const int64_t r2 = half_modq(x.a[2], x.Qi[2]);
const int64_t r3 = half_modq(x.a[3], x.Qi[3]);
return mod_q120{r0, r1, r2, r3};
}
bool operator==(const mod_q120& x, const mod_q120& y) {
for (uint64_t i = 0; i < 4; ++i) {
if (x.a[i] != y.a[i]) return false;
}
return true;
}
std::ostream& operator<<(std::ostream& out, const mod_q120& x) {
return out << "q120{" << x.a[0] << "," << x.a[1] << "," << x.a[2] << "," << x.a[3] << "}";
}
mod_q120 mod_q120::from_q120a(const void* addr) {
static const uint64_t _2p32 = UINT64_C(1) << 32;
const uint64_t* in = (const uint64_t*)addr;
mod_q120 r;
for (uint64_t i = 0; i < 4; ++i) {
REQUIRE_DRAMATICALLY(in[i] < _2p32, "invalid q120a layout");
r.a[i] = centermod(in[i], mod_q120::Qi[i]);
}
return r;
}
mod_q120 mod_q120::from_q120b(const void* addr) {
const uint64_t* in = (const uint64_t*)addr;
mod_q120 r;
for (uint64_t i = 0; i < 4; ++i) {
r.a[i] = centermod(in[i], mod_q120::Qi[i]);
}
return r;
}
mod_q120 mod_q120::from_q120c(const void* addr) {
//static const uint64_t _mask_2p32 = (uint64_t(1) << 32) - 1;
const uint32_t* in = (const uint32_t*)addr;
mod_q120 r;
for (uint64_t i = 0, k = 0; i < 8; i += 2, ++k) {
const uint64_t q = mod_q120::Qi[k];
uint64_t u = in[i];
uint64_t w = in[i + 1];
REQUIRE_DRAMATICALLY(((u << 32) % q) == (w % q),
"invalid layout q120c: " << u << ".2^32 != " << (w >> 32) << " mod " << q);
r.a[k] = centermod(u, q);
}
return r;
}
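// CRT reconstruction: with q = Q1*Q2*Q3*Q4 and qm[i] = q / Qi, the represented value is
// sum_i (a[i] * CRTi[i] mod Qi) * qm[i], reduced to the centered range; the CRT constants
// are assumed to satisfy CRTi[i] * qm[i] == 1 (mod Qi).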
__int128_t mod_q120::to_int128() const {
static const __int128_t qm[] = {(__int128_t(Qi[1]) * Qi[2]) * Qi[3], (__int128_t(Qi[0]) * Qi[2]) * Qi[3],
(__int128_t(Qi[0]) * Qi[1]) * Qi[3], (__int128_t(Qi[0]) * Qi[1]) * Qi[2]};
static const int64_t CRTi[] = {Q1_CRT_CST, Q2_CRT_CST, Q3_CRT_CST, Q4_CRT_CST};
static const __int128_t q = qm[0] * Qi[0];
static const __int128_t qs2 = q / 2;
__int128_t res = 0;
for (uint64_t i = 0; i < 4; ++i) {
res += (a[i] * CRTi[i] % Qi[i]) * qm[i];
}
res = (((res % q) + q + qs2) % q) - qs2; // centermod
return res;
}
void mod_q120::save_as_q120a(void* dest) const {
int64_t* d = (int64_t*)dest;
for (uint64_t i = 0; i < 4; ++i) {
d[i] = a[i] + Qi[i];
}
}
void mod_q120::save_as_q120b(void* dest) const {
int64_t* d = (int64_t*)dest;
for (uint64_t i = 0; i < 4; ++i) {
d[i] = a[i] + (Qi[i] * (1 + uniform_u64_bits(32)));
}
}
void mod_q120::save_as_q120c(void* dest) const {
int32_t* d = (int32_t*)dest;
for (uint64_t i = 0; i < 4; ++i) {
d[2 * i] = a[i] + 3 * Qi[i];
d[2 * i + 1] = (uint64_t(d[2 * i]) << 32) % uint64_t(Qi[i]);
}
}
mod_q120 uniform_q120() {
test_rng& gen = randgen();
std::uniform_int_distribution<uint64_t> dista(0, mod_q120::Qi[0]);
std::uniform_int_distribution<uint64_t> distb(0, mod_q120::Qi[1]);
std::uniform_int_distribution<uint64_t> distc(0, mod_q120::Qi[2]);
std::uniform_int_distribution<uint64_t> distd(0, mod_q120::Qi[3]);
return mod_q120(dista(gen), distb(gen), distc(gen), distd(gen));
}
void uniform_q120a(void* dest) {
uint64_t* res = (uint64_t*)dest;
for (uint64_t i = 0; i < 4; ++i) {
res[i] = uniform_u64_bits(32);
}
}
void uniform_q120b(void* dest) {
uint64_t* res = (uint64_t*)dest;
for (uint64_t i = 0; i < 4; ++i) {
res[i] = uniform_u64();
}
}
void uniform_q120c(void* dest) {
uint32_t* res = (uint32_t*)dest;
static const uint64_t _2p32 = uint64_t(1) << 32;
for (uint64_t i = 0, k = 0; i < 8; i += 2, ++k) {
const uint64_t q = mod_q120::Qi[k];
const uint64_t z = uniform_u64_bits(32);
const uint64_t z_pow_red = (z << 32) % q;
const uint64_t room = (_2p32 - z_pow_red) / q;
const uint64_t z_pow = z_pow_red + (uniform_u64() % room) * q;
REQUIRE_DRAMATICALLY(z < _2p32, "bug!");
REQUIRE_DRAMATICALLY(z_pow < _2p32, "bug!");
REQUIRE_DRAMATICALLY(z_pow % q == (z << 32) % q, "bug!");
res[i] = (uint32_t)z;
res[i + 1] = (uint32_t)z_pow;
}
}

View File

@@ -0,0 +1,49 @@
#ifndef SPQLIOS_MOD_Q120_H
#define SPQLIOS_MOD_Q120_H
#include <cstdint>
#include "../../spqlios/q120/q120_common.h"
#include "test_commons.h"
/** @brief centered modulo q */
int64_t centermod(int64_t v, int64_t q);
int64_t centermod(uint64_t v, int64_t q);
/** @brief this class represents an integer mod Q120 */
class mod_q120 {
public:
static constexpr int64_t Qi[] = {Q1, Q2, Q3, Q4};
int64_t a[4];
mod_q120(int64_t a1, int64_t a2, int64_t a3, int64_t a4);
mod_q120();
__int128_t to_int128() const;
static mod_q120 from_q120a(const void* addr);
static mod_q120 from_q120b(const void* addr);
static mod_q120 from_q120c(const void* addr);
void save_as_q120a(void* dest) const;
void save_as_q120b(void* dest) const;
void save_as_q120c(void* dest) const;
};
mod_q120 operator+(const mod_q120& x, const mod_q120& y);
mod_q120 operator-(const mod_q120& x, const mod_q120& y);
mod_q120 operator*(const mod_q120& x, const mod_q120& y);
mod_q120& operator+=(mod_q120& x, const mod_q120& y);
mod_q120& operator-=(mod_q120& x, const mod_q120& y);
mod_q120& operator*=(mod_q120& x, const mod_q120& y);
std::ostream& operator<<(std::ostream& out, const mod_q120& x);
bool operator==(const mod_q120& x, const mod_q120& y);
mod_q120 pow(const mod_q120& x, int32_t k);
mod_q120 half(const mod_q120& x);
/** @brief a uniformly drawn number mod Q120 */
mod_q120 uniform_q120();
/** @brief a uniformly random mod Q120 layout A (4 integers < 2^32) */
void uniform_q120a(void* dest);
/** @brief a uniformly random mod Q120 layout B (4 integers < 2^64) */
void uniform_q120b(void* dest);
/** @brief a uniformly random mod Q120 layout C (4 integers repr. x,2^32x) */
void uniform_q120c(void* dest);
#endif // SPQLIOS_MOD_Q120_H

View File

@@ -0,0 +1,18 @@
#include "negacyclic_polynomial_impl.h"
// explicit instantiation
EXPLICIT_INSTANTIATE_POLYNOMIAL(__int128_t);
EXPLICIT_INSTANTIATE_POLYNOMIAL(int64_t);
EXPLICIT_INSTANTIATE_POLYNOMIAL(double);
double infty_dist(const rnx_f64& a, const rnx_f64& b) {
const uint64_t nn = a.nn();
const double* aa = a.data();
const double* bb = b.data();
double res = 0.;
for (uint64_t i = 0; i < nn; ++i) {
double d = fabs(aa[i] - bb[i]);
if (d > res) res = d;
}
return res;
}

View File

@@ -0,0 +1,69 @@
#ifndef SPQLIOS_NEGACYCLIC_POLYNOMIAL_H
#define SPQLIOS_NEGACYCLIC_POLYNOMIAL_H
#include <cstdint>
#include "test_commons.h"
template <typename T>
class polynomial;
typedef polynomial<__int128_t> znx_i128;
typedef polynomial<int64_t> znx_i64;
typedef polynomial<double> rnx_f64;
template <typename T>
class polynomial {
public:
std::vector<T> coeffs;
/** @brief create a polynomial out of existing coeffs */
polynomial(uint64_t N, const T* c);
/** @brief zero polynomial of dimension N */
explicit polynomial(uint64_t N);
/** @brief empty polynomial (dim 0) */
polynomial();
/** @brief ring dimension */
uint64_t nn() const;
/** @brief special setter (accept any indexes, and does the negacyclic translation) */
void set_coeff(int64_t i, T v);
/** @brief special getter (accept any indexes, and does the negacyclic translation) */
T get_coeff(int64_t i) const;
/** @brief returns the coefficient layout */
T* data();
/** @brief returns the coefficient layout (const version) */
const T* data() const;
/** @brief saves to the layout */
void save_as(T* dest) const;
/** @brief zero */
static polynomial<T> zero(uint64_t n);
/** @brief random polynomial with coefficients in [-2^log2bound, 2^log2bound] */
static polynomial<T> random_log2bound(uint64_t n, uint64_t log2bound);
/** @brief random polynomial with uniformly drawn coefficients */
static polynomial<T> random(uint64_t n);
/** @brief random polynomial with coefficient in [lb;ub] */
static polynomial<T> random_bound(uint64_t n, const T lb, const T ub);
};
/** @brief equality operator (used during tests) */
template <typename T>
bool operator==(const polynomial<T>& a, const polynomial<T>& b);
/** @brief addition operator (used during tests) */
template <typename T>
polynomial<T> operator+(const polynomial<T>& a, const polynomial<T>& b);
/** @brief subtraction operator (used during tests) */
template <typename T>
polynomial<T> operator-(const polynomial<T>& a, const polynomial<T>& b);
/** @brief negation operator (used during tests) */
template <typename T>
polynomial<T> operator-(const polynomial<T>& a);
template <typename T>
polynomial<T> naive_product(const polynomial<T>& a, const polynomial<T>& b);
/** @brief distance between two real polynomials (used during tests) */
double infty_dist(const rnx_f64& a, const rnx_f64& b);
#endif // SPQLIOS_NEGACYCLIC_POLYNOMIAL_H

View File

@@ -0,0 +1,247 @@
#ifndef SPQLIOS_NEGACYCLIC_POLYNOMIAL_IMPL_H
#define SPQLIOS_NEGACYCLIC_POLYNOMIAL_IMPL_H
#include "negacyclic_polynomial.h"
template <typename T>
polynomial<T>::polynomial(uint64_t N, const T* c) : coeffs(N) {
for (uint64_t i = 0; i < N; ++i) coeffs[i] = c[i];
}
/** @brief zero polynomial of dimension N */
template <typename T>
polynomial<T>::polynomial(uint64_t N) : coeffs(N, 0) {}
/** @brief empty polynomial (dim 0) */
template <typename T>
polynomial<T>::polynomial() {}
/** @brief ring dimension */
template <typename T>
uint64_t polynomial<T>::nn() const {
uint64_t n = coeffs.size();
REQUIRE_DRAMATICALLY(is_pow2(n), "polynomial dim is not a pow of 2");
return n;
}
/** @brief special setter (accept any indexes, and does the negacyclic translation) */
template <typename T>
void polynomial<T>::set_coeff(int64_t i, T v) {
const uint64_t n = nn();
const uint64_t _2nm = 2 * n - 1;
uint64_t pos = uint64_t(i) & _2nm;
if (pos < n) {
coeffs[pos] = v;
} else {
coeffs[pos - n] = -v;
}
}
/** @brief special getter (accept any indexes, and does the negacyclic translation) */
template <typename T>
T polynomial<T>::get_coeff(int64_t i) const {
const uint64_t n = nn();
const uint64_t _2nm = 2 * n - 1;
uint64_t pos = uint64_t(i) & _2nm;
if (pos < n) {
return coeffs[pos];
} else {
return -coeffs[pos - n];
}
}
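// example of the negacyclic index translation above: for n = 4, get_coeff(5) returns
// -coeffs[1] and set_coeff(-1, v) stores -v into coeffs[3], since X^n = -1 in Z[X]/(X^n + 1).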
/** @brief returns the coefficient layout */
template <typename T>
T* polynomial<T>::data() {
return coeffs.data();
}
template <typename T>
void polynomial<T>::save_as(T* dest) const {
const uint64_t n = nn();
for (uint64_t i = 0; i < n; ++i) {
dest[i] = coeffs[i];
}
}
/** @brief returns the coefficient layout (const version) */
template <typename T>
const T* polynomial<T>::data() const {
return coeffs.data();
}
/** @brief zero polynomial of dimension n */
template <typename T>
polynomial<T> polynomial<T>::zero(uint64_t n) {
return polynomial<T>(n);
}
/** @brief equality operator (used during tests) */
template <typename T>
bool operator==(const polynomial<T>& a, const polynomial<T>& b) {
uint64_t n = a.nn();
REQUIRE_DRAMATICALLY(b.nn() == n, "wrong dimensions");
for (uint64_t i = 0; i < n; ++i) {
if (a.get_coeff(i) != b.get_coeff(i)) return false;
}
return true;
}
/** @brief addition operator (used during tests) */
template <typename T>
polynomial<T> operator+(const polynomial<T>& a, const polynomial<T>& b) {
uint64_t n = a.nn();
REQUIRE_DRAMATICALLY(b.nn() == n, "wrong dimensions");
polynomial<T> res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, a.get_coeff(i) + b.get_coeff(i));
}
return res;
}
/** @brief subtraction operator (used during tests) */
template <typename T>
polynomial<T> operator-(const polynomial<T>& a, const polynomial<T>& b) {
uint64_t n = a.nn();
REQUIRE_DRAMATICALLY(b.nn() == n, "wrong dimensions");
polynomial<T> res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, a.get_coeff(i) - b.get_coeff(i));
}
return res;
}
/** @brief negation operator (used during tests) */
template <typename T>
polynomial<T> operator-(const polynomial<T>& a) {
uint64_t n = a.nn();
polynomial<T> res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, -a.get_coeff(i));
}
return res;
}
/** @brief random polynomial */
template <typename T>
polynomial<T> random_polynomial(uint64_t n);
/** @brief random int64 polynomial */
template <>
polynomial<int64_t> random_polynomial(uint64_t n) {
polynomial<int64_t> res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, uniform_i64());
}
return res;
}
/** @brief random float64 gaussian polynomial */
template <>
polynomial<double> random_polynomial(uint64_t n) {
polynomial<double> res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, random_f64_gaussian());
}
return res;
}
template <typename T>
polynomial<T> random_polynomial_bounds(uint64_t n, const T lb, const T ub);
/** @brief random int64 polynomial with coefficients in [lb;ub] */
template <>
polynomial<int64_t> random_polynomial_bounds(uint64_t n, const int64_t lb, const int64_t ub) {
polynomial<int64_t> res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, uniform_i64_bounds(lb, ub));
}
return res;
}
/** @brief random float64 polynomial with uniform coefficients in [lb;ub] */
template <>
polynomial<double> random_polynomial_bounds(uint64_t n, const double lb, const double ub) {
polynomial<double> res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, uniform_f64_bounds(lb, ub));
}
return res;
}
/** @brief random int128 polynomial with coefficients in [lb;ub] */
template <>
polynomial<__int128_t> random_polynomial_bounds(uint64_t n, const __int128_t lb, const __int128_t ub) {
polynomial<__int128_t> res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, uniform_i128_bounds(lb, ub));
}
return res;
}
template <typename T>
polynomial<T> random_polynomial_bits(uint64_t n, const uint64_t bits) {
T b = UINT64_C(1) << bits;
return random_polynomial_bounds(n, -b, b);
}
template <>
polynomial<int64_t> polynomial<int64_t>::random_log2bound(uint64_t n, uint64_t log2bound) {
polynomial<int64_t> res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, uniform_i64_bits(log2bound));
}
return res;
}
template <>
polynomial<int64_t> polynomial<int64_t>::random(uint64_t n) {
polynomial<int64_t> res(n);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, uniform_u64());
}
return res;
}
template <>
polynomial<double> polynomial<double>::random_log2bound(uint64_t n, uint64_t log2bound) {
polynomial<double> res(n);
double bound = pow(2., log2bound);
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, uniform_f64_bounds(-bound, bound));
}
return res;
}
template <>
polynomial<double> polynomial<double>::random(uint64_t n) {
polynomial<double> res(n);
double bound = 2.;
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, uniform_f64_bounds(-bound, bound));
}
return res;
}
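/** @brief naive negacyclic product (quadratic time): res_i = sum_j a_j * b_{i-j}, where the
 *  index wrap-around past n-1 picks up a minus sign because X^n = -1 */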
template <typename T>
polynomial<T> naive_product(const polynomial<T>& a, const polynomial<T>& b) {
const int64_t nn = a.nn();
REQUIRE_DRAMATICALLY(b.nn() == uint64_t(nn), "dimension mismatch!");
polynomial<T> res(nn);
for (int64_t i = 0; i < nn; ++i) {
T ri = 0;
for (int64_t j = 0; j < nn; ++j) {
ri += a.get_coeff(j) * b.get_coeff(i - j);
}
res.set_coeff(i, ri);
}
return res;
}
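/* Illustration only (not part of the original testlib API): a minimal sketch of the
 * negacyclic convention that naive_product relies on, assuming polynomial<T>::get_coeff
 * wraps out-of-range indices with X^n = -1 (so coefficient i-j of b enters with a minus
 * sign whenever j > i). In Z[X]/(X^2+1): (1 + X) * (1 + X) = 1 + 2X + X^2 = 2X. */
template <typename T>
bool naive_product_small_example_holds() {
  polynomial<T> a(2);
  a.set_coeff(0, 1);
  a.set_coeff(1, 1);
  polynomial<T> p = naive_product(a, a);
  return p.get_coeff(0) == 0 && p.get_coeff(1) == 2;
}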
#define EXPLICIT_INSTANTIATE_POLYNOMIAL(TYPE) \
template class polynomial<TYPE>; \
template bool operator==(const polynomial<TYPE>& a, const polynomial<TYPE>& b); \
template polynomial<TYPE> operator+(const polynomial<TYPE>& a, const polynomial<TYPE>& b); \
template polynomial<TYPE> operator-(const polynomial<TYPE>& a, const polynomial<TYPE>& b); \
template polynomial<TYPE> operator-(const polynomial<TYPE>& a); \
template polynomial<TYPE> random_polynomial_bits(uint64_t n, const uint64_t bits); \
template polynomial<TYPE> naive_product(const polynomial<TYPE>& a, const polynomial<TYPE>& b); \
// template polynomial<TYPE> random_polynomial(uint64_t n);
#endif // SPQLIOS_NEGACYCLIC_POLYNOMIAL_IMPL_H

View File

@@ -0,0 +1,122 @@
#include "ntt120_dft.h"
#include "mod_q120.h"
/** @brief alternative (naive) version of the NTT */
/** for all s=k/2^17, root_of_unity(s) = omega_0^k */
static mod_q120 root_of_unity(double s) {
static mod_q120 omega_2pow17{OMEGA1, OMEGA2, OMEGA3, OMEGA4};
static double _2pow17 = 1 << 17;
return pow(omega_2pow17, s * _2pow17);
}
static mod_q120 root_of_unity_inv(double s) {
static mod_q120 omega_2pow17{OMEGA1, OMEGA2, OMEGA3, OMEGA4};
static double _2pow17 = 1 << 17;
return pow(omega_2pow17, -s * _2pow17);
}
/** recursive naive ntt */
static void q120_ntt_naive_rec(uint64_t n, double entry_pwr, mod_q120* data) {
if (n == 1) return;
const uint64_t h = n / 2;
const double s = entry_pwr / 2.;
mod_q120 om = root_of_unity(s);
for (uint64_t j = 0; j < h; ++j) {
mod_q120 om_right = data[h + j] * om;
data[h + j] = data[j] - om_right;
data[j] = data[j] + om_right;
}
q120_ntt_naive_rec(h, s, data);
q120_ntt_naive_rec(h, s + 0.5, data + h);
}
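/** recursive naive inverse ntt.
 * Descriptive note: each combine step applies half(), so after log2(n) levels the usual
 * 1/n normalization of the inverse transform is already folded in, and running the
 * forward then the inverse recursion on the same buffer should return the original
 * values (mod the 120-bit modulus). */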
static void q120_intt_naive_rec(uint64_t n, double entry_pwr, mod_q120* data) {
if (n == 1) return;
const uint64_t h = n / 2;
const double s = entry_pwr / 2.;
q120_intt_naive_rec(h, s, data);
q120_intt_naive_rec(h, s + 0.5, data + h);
mod_q120 om = root_of_unity_inv(s);
for (uint64_t j = 0; j < h; ++j) {
mod_q120 dat_diff = half(data[j] - data[h + j]);
data[j] = half(data[j] + data[h + j]);
data[h + j] = dat_diff * om;
}
}
/** user friendly version */
q120_nttvec simple_ntt120(const znx_i64& polynomial) {
const uint64_t n = polynomial.nn();
q120_nttvec res(n);
for (uint64_t i = 0; i < n; ++i) {
int64_t xi = polynomial.get_coeff(i);
res.v[i] = mod_q120(xi, xi, xi, xi);
}
q120_ntt_naive_rec(n, 0.5, res.v.data());
return res;
}
znx_i128 simple_intt120(const q120_nttvec& fftvec) {
const uint64_t n = fftvec.nn();
q120_nttvec copy = fftvec;
znx_i128 res(n);
q120_intt_naive_rec(n, 0.5, copy.v.data());
for (uint64_t i = 0; i < n; ++i) {
res.set_coeff(i, copy.v[i].to_int128());
}
return res;
}
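/* Round-trip sketch (illustration only; assumes znx_i64/znx_i128 expose the usual
 * polynomial<T> accessors used elsewhere in this testlib): as long as the input
 * coefficients stay far below the ~120-bit modulus, the inverse transform of the
 * forward transform should return the original coefficients, lifted to __int128_t. */
[[maybe_unused]] static bool ntt120_roundtrip_example(uint64_t n) {
  znx_i64 p = znx_i64::random_log2bound(n, 40);
  znx_i128 back = simple_intt120(simple_ntt120(p));
  for (uint64_t i = 0; i < n; ++i) {
    if (back.get_coeff(i) != __int128_t(p.get_coeff(i))) return false;
  }
  return true;
}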
bool operator==(const q120_nttvec& a, const q120_nttvec& b) { return a.v == b.v; }
std::vector<mod_q120> q120_ntt_naive(const std::vector<mod_q120>& x) {
std::vector<mod_q120> res = x;
q120_ntt_naive_rec(res.size(), 0.5, res.data());
return res;
}
q120_nttvec::q120_nttvec(uint64_t n) : v(n) {}
q120_nttvec::q120_nttvec(uint64_t n, const q120b* data) : v(n) {
int64_t* d = (int64_t*)data;
for (uint64_t i = 0; i < n; ++i) {
v[i] = mod_q120::from_q120b(d + 4 * i);
}
}
q120_nttvec::q120_nttvec(uint64_t n, const q120c* data) : v(n) {
int64_t* d = (int64_t*)data;
for (uint64_t i = 0; i < n; ++i) {
v[i] = mod_q120::from_q120c(d + 4 * i);
}
}
uint64_t q120_nttvec::nn() const { return v.size(); }
q120_nttvec q120_nttvec::zero(uint64_t n) { return q120_nttvec(n); }
void q120_nttvec::save_as(q120a* dest) const {
int64_t* const d = (int64_t*)dest;
const uint64_t n = nn();
for (uint64_t i = 0; i < n; ++i) {
v[i].save_as_q120a(d + 4 * i);
}
}
void q120_nttvec::save_as(q120b* dest) const {
int64_t* const d = (int64_t*)dest;
const uint64_t n = nn();
for (uint64_t i = 0; i < n; ++i) {
v[i].save_as_q120b(d + 4 * i);
}
}
void q120_nttvec::save_as(q120c* dest) const {
int64_t* const d = (int64_t*)dest;
const uint64_t n = nn();
for (uint64_t i = 0; i < n; ++i) {
v[i].save_as_q120c(d + 4 * i);
}
}
mod_q120 q120_nttvec::get_blk(uint64_t blk) const {
REQUIRE_DRAMATICALLY(blk < nn(), "blk overflow");
return v[blk];
}
q120_nttvec q120_nttvec::random(uint64_t n) {
q120_nttvec res(n);
for (uint64_t i = 0; i < n; ++i) {
res.v[i] = uniform_q120();
}
return res;
}

View File

@@ -0,0 +1,31 @@
#ifndef SPQLIOS_NTT120_DFT_H
#define SPQLIOS_NTT120_DFT_H
#include <vector>
#include "../../spqlios/q120/q120_arithmetic.h"
#include "mod_q120.h"
#include "negacyclic_polynomial.h"
#include "test_commons.h"
class q120_nttvec {
public:
std::vector<mod_q120> v;
q120_nttvec() = default;
explicit q120_nttvec(uint64_t n);
q120_nttvec(uint64_t n, const q120b* data);
q120_nttvec(uint64_t n, const q120c* data);
uint64_t nn() const;
static q120_nttvec zero(uint64_t n);
static q120_nttvec random(uint64_t n);
void save_as(q120a* dest) const;
void save_as(q120b* dest) const;
void save_as(q120c* dest) const;
mod_q120 get_blk(uint64_t blk) const;
};
q120_nttvec simple_ntt120(const znx_i64& polynomial);
znx_i128 simple_intt120(const q120_nttvec& fftvec);
bool operator==(const q120_nttvec& a, const q120_nttvec& b);
#endif // SPQLIOS_NTT120_DFT_H

View File

@@ -0,0 +1,66 @@
#include "ntt120_layouts.h"
mod_q120x2::mod_q120x2() {}
mod_q120x2::mod_q120x2(const mod_q120& a, const mod_q120& b) {
value[0] = a;
value[1] = b;
}
mod_q120x2::mod_q120x2(q120x2b* addr) {
uint64_t* p = (uint64_t*)addr;
value[0] = mod_q120::from_q120b(p);
value[1] = mod_q120::from_q120b(p + 4);
}
ntt120_vec_znx_dft_layout::ntt120_vec_znx_dft_layout(uint64_t n, uint64_t size)
: nn(n), //
size(size), //
data((VEC_ZNX_DFT*)alloc64(n * size * 4 * sizeof(uint64_t))) {}
mod_q120x2 ntt120_vec_znx_dft_layout::get_copy_zext(uint64_t idx, uint64_t blk) {
return mod_q120x2(get_blk(idx, blk));
}
q120x2b* ntt120_vec_znx_dft_layout::get_blk(uint64_t idx, uint64_t blk) {
REQUIRE_DRAMATICALLY(idx < size, "idx overflow");
REQUIRE_DRAMATICALLY(blk < nn / 2, "blk overflow");
uint64_t* d = (uint64_t*)data;
return (q120x2b*)(d + 4 * nn * idx + 8 * blk);
}
ntt120_vec_znx_dft_layout::~ntt120_vec_znx_dft_layout() { spqlios_free(data); }
q120_nttvec ntt120_vec_znx_dft_layout::get_copy_zext(uint64_t idx) {
int64_t* d = (int64_t*)data;
if (idx < size) {
return q120_nttvec(nn, (q120b*)(d + idx * nn * 4));
} else {
return q120_nttvec::zero(nn);
}
}
void ntt120_vec_znx_dft_layout::set(uint64_t idx, const q120_nttvec& value) {
REQUIRE_DRAMATICALLY(idx < size, "index overflow: " << idx << " / " << size);
q120b* dest_addr = (q120b*)((int64_t*)data + idx * nn * 4);
value.save_as(dest_addr);
}
void ntt120_vec_znx_dft_layout::fill_random() {
for (uint64_t i = 0; i < size; ++i) {
set(i, q120_nttvec::random(nn));
}
}
thash ntt120_vec_znx_dft_layout::content_hash() const { return test_hash(data, nn * size * 4 * sizeof(int64_t)); }
ntt120_vec_znx_big_layout::ntt120_vec_znx_big_layout(uint64_t n, uint64_t size)
: nn(n), //
size(size),
data((VEC_ZNX_BIG*)alloc64(n * size * sizeof(__int128_t))) {}
znx_i128 ntt120_vec_znx_big_layout::get_copy(uint64_t index) const { return znx_i128(nn, get_addr(index)); }
znx_i128 ntt120_vec_znx_big_layout::get_copy_zext(uint64_t index) const {
if (index < size) {
return znx_i128(nn, get_addr(index));
} else {
return znx_i128::zero(nn);
}
}
__int128* ntt120_vec_znx_big_layout::get_addr(uint64_t index) const {
REQUIRE_DRAMATICALLY(index < size, "index overflow: " << index << " / " << size);
return (__int128_t*)data + index * nn;
}
void ntt120_vec_znx_big_layout::set(uint64_t index, const znx_i128& value) { value.save_as(get_addr(index)); }
ntt120_vec_znx_big_layout::~ntt120_vec_znx_big_layout() { spqlios_free(data); }

View File

@@ -0,0 +1,103 @@
#ifndef SPQLIOS_NTT120_LAYOUTS_H
#define SPQLIOS_NTT120_LAYOUTS_H
#include "../../spqlios/arithmetic/vec_znx_arithmetic.h"
#include "mod_q120.h"
#include "negacyclic_polynomial.h"
#include "ntt120_dft.h"
#include "test_commons.h"
struct q120b_vector_view {};
struct mod_q120x2 {
mod_q120 value[2];
mod_q120x2();
mod_q120x2(const mod_q120& a, const mod_q120& b);
mod_q120x2(__int128_t value);
explicit mod_q120x2(q120x2b* addr);
explicit mod_q120x2(q120x2c* addr);
void save_as(q120x2b* addr) const;
void save_as(q120x2c* addr) const;
static mod_q120x2 random();
};
mod_q120x2 operator+(const mod_q120x2& a, const mod_q120x2& b);
mod_q120x2 operator-(const mod_q120x2& a, const mod_q120x2& b);
mod_q120x2 operator*(const mod_q120x2& a, const mod_q120x2& b);
bool operator==(const mod_q120x2& a, const mod_q120x2& b);
bool operator!=(const mod_q120x2& a, const mod_q120x2& b);
mod_q120x2& operator+=(mod_q120x2& a, const mod_q120x2& b);
mod_q120x2& operator-=(mod_q120x2& a, const mod_q120x2& b);
/** @brief test layout for the VEC_ZNX_DFT */
struct ntt120_vec_znx_dft_layout {
const uint64_t nn;
const uint64_t size;
VEC_ZNX_DFT* const data;
ntt120_vec_znx_dft_layout(uint64_t n, uint64_t size);
mod_q120x2 get_copy_zext(uint64_t idx, uint64_t blk);
q120_nttvec get_copy_zext(uint64_t idx);
void set(uint64_t idx, const q120_nttvec& v);
q120x2b* get_blk(uint64_t idx, uint64_t blk);
thash content_hash() const;
void fill_random();
~ntt120_vec_znx_dft_layout();
};
/** @brief test layout for the VEC_ZNX_BIG */
class ntt120_vec_znx_big_layout {
public:
const uint64_t nn;
const uint64_t size;
VEC_ZNX_BIG* const data;
ntt120_vec_znx_big_layout(uint64_t n, uint64_t size);
private:
__int128* get_addr(uint64_t index) const;
public:
znx_i128 get_copy(uint64_t index) const;
znx_i128 get_copy_zext(uint64_t index) const;
void set(uint64_t index, const znx_i128& value);
~ntt120_vec_znx_big_layout();
};
/** @brief test layout for the VMP_PMAT */
class ntt120_vmp_pmat_layout {
const uint64_t nn;
const uint64_t nrows;
const uint64_t ncols;
VMP_PMAT* const data;
ntt120_vmp_pmat_layout(uint64_t n, uint64_t nrows, uint64_t ncols);
mod_q120x2 get(uint64_t row, uint64_t col, uint64_t blk) const;
~ntt120_vmp_pmat_layout();
};
/** @brief test layout for the SVP_PPOL */
class ntt120_svp_ppol_layout {
const uint64_t nn;
SVP_PPOL* const data;
ntt120_svp_ppol_layout(uint64_t n);
~ntt120_svp_ppol_layout();
};
/** @brief test layout for the CNV_PVEC_L */
class ntt120_cnv_left_layout {
const uint64_t nn;
const uint64_t size;
CNV_PVEC_L* const data;
ntt120_cnv_left_layout(uint64_t n, uint64_t size);
mod_q120x2 get(uint64_t idx, uint64_t blk);
~ntt120_cnv_left_layout();
};
/** @brief test layout for the CNV_PVEC_R */
class ntt120_cnv_right_layout {
const uint64_t nn;
const uint64_t size;
CNV_PVEC_R* const data;
ntt120_cnv_right_layout(uint64_t n, uint64_t size);
mod_q120x2 get(uint64_t idx, uint64_t blk);
~ntt120_cnv_right_layout();
};
#endif // SPQLIOS_NTT120_LAYOUTS_H

View File

@@ -0,0 +1,69 @@
#include "polynomial_vector.h"
#include <cstring>
#ifdef VALGRIND_MEM_TESTS
#include "valgrind/memcheck.h"
#endif
#define CANARY_PADDING (1024)
#define GARBAGE_VALUE (242)
znx_vec_i64_layout::znx_vec_i64_layout(uint64_t n, uint64_t size, uint64_t slice) : n(n), size(size), slice(slice) {
REQUIRE_DRAMATICALLY(is_pow2(n), "not a power of 2: " << n);
REQUIRE_DRAMATICALLY(slice >= n, "slice too small: " << slice << " < " << n);
this->region = (uint8_t*)malloc(size * slice * sizeof(int64_t) + 2 * CANARY_PADDING);
this->data_start = (int64_t*)(region + CANARY_PADDING);
// ensure that any invalid value is kind-of garbage
memset(region, GARBAGE_VALUE, size * slice * sizeof(int64_t) + 2 * CANARY_PADDING);
// mark inter-slice memory as not accessible
#ifdef VALGRIND_MEM_TESTS
VALGRIND_MAKE_MEM_NOACCESS(region, CANARY_PADDING);
VALGRIND_MAKE_MEM_NOACCESS(region + size * slice * sizeof(int64_t) + CANARY_PADDING, CANARY_PADDING);
for (uint64_t i = 0; i < size; ++i) {
VALGRIND_MAKE_MEM_UNDEFINED(data_start + i * slice, n * sizeof(int64_t));
}
if (size != slice) {
for (uint64_t i = 0; i < size; ++i) {
VALGRIND_MAKE_MEM_NOACCESS(data_start + i * slice + n, (slice - n) * sizeof(int64_t));
}
}
#endif
}
znx_vec_i64_layout::~znx_vec_i64_layout() { free(region); }
znx_i64 znx_vec_i64_layout::get_copy_zext(uint64_t index) const {
if (index < size) {
return znx_i64(n, data_start + index * slice);
} else {
return znx_i64::zero(n);
}
}
znx_i64 znx_vec_i64_layout::get_copy(uint64_t index) const {
REQUIRE_DRAMATICALLY(index < size, "index overflow: " << index << " / " << size);
return znx_i64(n, data_start + index * slice);
}
void znx_vec_i64_layout::set(uint64_t index, const znx_i64& elem) {
REQUIRE_DRAMATICALLY(index < size, "index overflow: " << index << " / " << size);
REQUIRE_DRAMATICALLY(elem.nn() == n, "incompatible ring dimensions: " << elem.nn() << " / " << n);
elem.save_as(data_start + index * slice);
}
int64_t* znx_vec_i64_layout::data() { return data_start; }
const int64_t* znx_vec_i64_layout::data() const { return data_start; }
void znx_vec_i64_layout::fill_random(uint64_t bits) {
for (uint64_t i = 0; i < size; ++i) {
set(i, znx_i64::random_log2bound(n, bits));
}
}
__uint128_t znx_vec_i64_layout::content_hash() const {
test_hasher hasher;
for (uint64_t i = 0; i < size; ++i) {
hasher.update(data() + i * slice, n * sizeof(int64_t));
}
return hasher.hash();
}
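/* Usage sketch (illustration only): a typical test allocates a layout, fills it with
 * random polynomials, records the content hash, passes layout.data() as a read-only
 * input to the function under test, and finally checks the hash did not change. */
[[maybe_unused]] static void znx_vec_i64_layout_usage_example() {
  znx_vec_i64_layout in(64, 3, 64);  // ring dim n=64, 3 polynomials, slice of 64 int64
  in.fill_random(40);
  __uint128_t before = in.content_hash();
  // ... call the function under test with in.data() as a read-only input ...
  REQUIRE_DRAMATICALLY(in.content_hash() == before, "read-only input was modified");
}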

View File

@@ -0,0 +1,42 @@
#ifndef SPQLIOS_POLYNOMIAL_VECTOR_H
#define SPQLIOS_POLYNOMIAL_VECTOR_H
#include "negacyclic_polynomial.h"
#include "test_commons.h"
/** @brief a test memory layout for znx i64 polynomials vectors */
class znx_vec_i64_layout {
uint64_t n;
uint64_t size;
uint64_t slice;
int64_t* data_start;
uint8_t* region;
public:
// NO-COPY structure
znx_vec_i64_layout(const znx_vec_i64_layout&) = delete;
void operator=(const znx_vec_i64_layout&) = delete;
znx_vec_i64_layout(znx_vec_i64_layout&&) = delete;
void operator=(znx_vec_i64_layout&&) = delete;
/** @brief initialises a memory layout */
znx_vec_i64_layout(uint64_t n, uint64_t size, uint64_t slice);
/** @brief destructor */
~znx_vec_i64_layout();
/** @brief get a copy of the item at position index (zero if index >= size) */
znx_i64 get_copy_zext(uint64_t index) const;
/** @brief get a copy of the item at position index (requires index < size) */
znx_i64 get_copy(uint64_t index) const;
/** @brief sets the item at position index (requires index < size) */
void set(uint64_t index, const znx_i64& elem);
/** @brief fill with random values */
void fill_random(uint64_t bits = 63);
/** @brief raw pointer access */
int64_t* data();
/** @brief raw pointer access (const version) */
const int64_t* data() const;
/** @brief content hashcode */
__uint128_t content_hash() const;
};
#endif // SPQLIOS_POLYNOMIAL_VECTOR_H

View File

@@ -0,0 +1,55 @@
#include <cstdint>
#include <random>
#include "test_commons.h"
bool is_pow2(uint64_t n) { return n != 0 && !(n & (n - 1)); }
test_rng& randgen() {
static test_rng gen;
return gen;
}
uint64_t uniform_u64() {
static std::uniform_int_distribution<uint64_t> dist64(0, UINT64_MAX);
return dist64(randgen());
}
uint64_t uniform_u64_bits(uint64_t nbits) {
if (nbits == 0) return 0;
if (nbits >= 64) return uniform_u64();
return uniform_u64() >> (64 - nbits);
}
int64_t uniform_i64() {
std::uniform_int_distribution<int64_t> dist;
return dist(randgen());
}
int64_t uniform_i64_bits(uint64_t nbits) {
int64_t bound = int64_t(1) << nbits;
std::uniform_int_distribution<int64_t> dist(-bound, bound);
return dist(randgen());
}
int64_t uniform_i64_bounds(const int64_t lb, const int64_t ub) {
std::uniform_int_distribution<int64_t> dist(lb, ub);
return dist(randgen());
}
__int128_t uniform_i128_bounds(const __int128_t lb, const __int128_t ub) {
std::uniform_int_distribution<__int128_t> dist(lb, ub);
return dist(randgen());
}
double random_f64_gaussian(double stdev) {
std::normal_distribution<double> dist(0, stdev);
return dist(randgen());
}
double uniform_f64_bounds(const double lb, const double ub) {
std::uniform_real_distribution<double> dist(lb, ub);
return dist(randgen());
}
double uniform_f64_01() {
return uniform_f64_bounds(0, 1);
}

View File

@@ -0,0 +1,145 @@
#include "reim4_elem.h"
reim4_elem::reim4_elem(const double* re, const double* im) {
for (uint64_t i = 0; i < 4; ++i) {
value[i] = re[i];
value[4 + i] = im[i];
}
}
reim4_elem::reim4_elem(const double* layout) {
for (uint64_t i = 0; i < 8; ++i) {
value[i] = layout[i];
}
}
reim4_elem::reim4_elem() {
for (uint64_t i = 0; i < 8; ++i) {
value[i] = 0.;
}
}
void reim4_elem::save_re_im(double* re, double* im) const {
for (uint64_t i = 0; i < 4; ++i) {
re[i] = value[i];
im[i] = value[4 + i];
}
}
void reim4_elem::save_as(double* reim4) const {
for (uint64_t i = 0; i < 8; ++i) {
reim4[i] = value[i];
}
}
reim4_elem reim4_elem::zero() { return reim4_elem(); }
bool operator==(const reim4_elem& x, const reim4_elem& y) {
for (uint64_t i = 0; i < 8; ++i) {
if (x.value[i] != y.value[i]) return false;
}
return true;
}
reim4_elem gaussian_reim4() {
test_rng& gen = randgen();
std::normal_distribution<double> dist(0, 1);
reim4_elem res;
for (uint64_t i = 0; i < 8; ++i) {
res.value[i] = dist(gen);
}
return res;
}
reim4_array_view::reim4_array_view(uint64_t size, double* data) : size(size), data(data) {}
reim4_elem reim4_array_view::get(uint64_t i) const {
REQUIRE_DRAMATICALLY(i < size, "reim4 array overflow");
return reim4_elem(data + 8 * i);
}
void reim4_array_view::set(uint64_t i, const reim4_elem& value) {
REQUIRE_DRAMATICALLY(i < size, "reim4 array overflow");
value.save_as(data + 8 * i);
}
reim_view::reim_view(uint64_t m, double* data) : m(m), data(data) {}
reim4_elem reim_view::get_blk(uint64_t i) {
REQUIRE_DRAMATICALLY(i < m / 4, "block overflow");
return reim4_elem(data + 4 * i, data + m + 4 * i);
}
void reim_view::set_blk(uint64_t i, const reim4_elem& value) {
REQUIRE_DRAMATICALLY(i < m / 4, "block overflow");
value.save_re_im(data + 4 * i, data + m + 4 * i);
}
reim_vector_view::reim_vector_view(uint64_t m, uint64_t nrows, double* data) : m(m), nrows(nrows), data(data) {}
reim_view reim_vector_view::row(uint64_t row) {
REQUIRE_DRAMATICALLY(row < nrows, "row overflow");
return reim_view(m, data + 2 * m * row);
}
/** @brief addition */
reim4_elem operator+(const reim4_elem& x, const reim4_elem& y) {
reim4_elem reps;
for (uint64_t i = 0; i < 8; ++i) {
reps.value[i] = x.value[i] + y.value[i];
}
return reps;
}
reim4_elem& operator+=(reim4_elem& x, const reim4_elem& y) {
for (uint64_t i = 0; i < 8; ++i) {
x.value[i] += y.value[i];
}
return x;
}
/** @brief subtraction */
reim4_elem operator-(const reim4_elem& x, const reim4_elem& y) {
reim4_elem reps;
for (uint64_t i = 0; i < 8; ++i) {
reps.value[i] = x.value[i] - y.value[i];
}
return reps;
}
reim4_elem& operator-=(reim4_elem& x, const reim4_elem& y) {
for (uint64_t i = 0; i < 8; ++i) {
x.value[i] -= y.value[i];
}
return x;
}
/** @brief product */
reim4_elem operator*(const reim4_elem& x, const reim4_elem& y) {
reim4_elem reps;
for (uint64_t i = 0; i < 4; ++i) {
double xre = x.value[i];
double yre = y.value[i];
double xim = x.value[i + 4];
double yim = y.value[i + 4];
reps.value[i] = xre * yre - xim * yim;
reps.value[i + 4] = xre * yim + xim * yre;
}
return reps;
}
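/* Illustration only (not part of the original file): each of the 4 lanes is an independent
 * complex product, e.g. lane 0 of (1+2i)*(3+4i) should hold re = 1*3 - 2*4 = -5 and
 * im = 1*4 + 2*3 = 10; all values here are exact in double precision. */
[[maybe_unused]] static bool reim4_product_lane_example() {
  double re_x[4] = {1, 0, 0, 0}, im_x[4] = {2, 0, 0, 0};
  double re_y[4] = {3, 0, 0, 0}, im_y[4] = {4, 0, 0, 0};
  reim4_elem p = reim4_elem(re_x, im_x) * reim4_elem(re_y, im_y);
  return p.value[0] == -5. && p.value[4] == 10.;
}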
/** @brief distance in infty norm */
double infty_dist(const reim4_elem& x, const reim4_elem& y) {
double dist = 0;
for (uint64_t i = 0; i < 8; ++i) {
double d = fabs(x.value[i] - y.value[i]);
if (d > dist) dist = d;
}
return dist;
}
std::ostream& operator<<(std::ostream& out, const reim4_elem& x) {
out << "[\n";
for (uint64_t i = 0; i < 4; ++i) {
out << " re=" << x.value[i] << ", im=" << x.value[i + 4] << "\n";
}
return out << "]";
}
reim4_matrix_view::reim4_matrix_view(uint64_t nrows, uint64_t ncols, double* data)
: nrows(nrows), ncols(ncols), data(data) {}
reim4_elem reim4_matrix_view::get(uint64_t row, uint64_t col) const {
REQUIRE_DRAMATICALLY(row < nrows, "row out of bounds: " << row << " / " << nrows);
REQUIRE_DRAMATICALLY(col < ncols, "col out of bounds: " << col << " / " << ncols);
return reim4_elem(data + 8 * (row * ncols + col));
}
void reim4_matrix_view::set(uint64_t row, uint64_t col, const reim4_elem& value) {
REQUIRE_DRAMATICALLY(row < nrows, "row out of bounds: " << row << " / " << nrows);
REQUIRE_DRAMATICALLY(col < ncols, "col out of bounds: " << col << " / " << ncols);
value.save_as(data + 8 * (row * ncols + col));
}

View File

@@ -0,0 +1,95 @@
#ifndef SPQLIOS_REIM4_ELEM_H
#define SPQLIOS_REIM4_ELEM_H
#include "test_commons.h"
/** @brief test class representing one single reim4 element */
class reim4_elem {
public:
/** @brief 8 components (4 real parts followed by 4 imag parts) */
double value[8];
/** @brief constructs from 4 real parts and 4 imaginary parts */
reim4_elem(const double* re, const double* im);
/** @brief constructs from 8 components */
explicit reim4_elem(const double* layout);
/** @brief zero */
reim4_elem();
/** @brief saves the 4 real parts to re and the 4 imaginary parts to im */
void save_re_im(double* re, double* im) const;
/** @brief saves the 8 components to reim4 */
void save_as(double* reim4) const;
static reim4_elem zero();
};
/** @brief checks for equality */
bool operator==(const reim4_elem& x, const reim4_elem& y);
/** @brief random gaussian reim4 of stdev 1 and mean 0 */
reim4_elem gaussian_reim4();
/** @brief addition */
reim4_elem operator+(const reim4_elem& x, const reim4_elem& y);
reim4_elem& operator+=(reim4_elem& x, const reim4_elem& y);
/** @brief subtraction */
reim4_elem operator-(const reim4_elem& x, const reim4_elem& y);
reim4_elem& operator-=(reim4_elem& x, const reim4_elem& y);
/** @brief product */
reim4_elem operator*(const reim4_elem& x, const reim4_elem& y);
std::ostream& operator<<(std::ostream& out, const reim4_elem& x);
/** @brief distance in infty norm */
double infty_dist(const reim4_elem& x, const reim4_elem& y);
/** @brief test class representing the view of one reim of m complexes */
class reim4_array_view {
uint64_t size; ///< size of the reim array
double* data; ///< pointer to the start of the array
public:
/** @brief initializes a view at an existing given address */
reim4_array_view(uint64_t size, double* data);
/** @brief gets the i-th element */
reim4_elem get(uint64_t i) const;
/** @brief sets the i-th element */
void set(uint64_t i, const reim4_elem& value);
};
/** @brief test class representing the view of one matrix of nrowsxncols reim4's */
class reim4_matrix_view {
uint64_t nrows; ///< number of rows
uint64_t ncols; ///< number of columns
double* data; ///< pointer to the start of the matrix
public:
/** @brief initializes a view at an existing given address */
reim4_matrix_view(uint64_t nrows, uint64_t ncols, double* data);
/** @brief gets the i-th element */
reim4_elem get(uint64_t row, uint64_t col) const;
/** @brief sets the i-th element */
void set(uint64_t row, uint64_t col, const reim4_elem& value);
};
/** @brief test class representing the view of one reim of m complexes */
class reim_view {
uint64_t m; ///< (complex) dimension of the reim polynomial
double* data; ///< address of the start of the reim polynomial
public:
/** @brief initializes a view at an existing given address */
reim_view(uint64_t m, double* data);
/** @brief extracts the i-th reim4 block (i<m/4) */
reim4_elem get_blk(uint64_t i);
/** @brief sets the i-th reim4 block (i<m/4) */
void set_blk(uint64_t i, const reim4_elem& value);
};
/** @brief view of one contiguous reim vector */
class reim_vector_view {
uint64_t m; ///< (complex) dimension of the reim polynomial
uint64_t nrows; ///< number of reim polynomials
double* data; ///< address of the start of the reim polynomial
public:
/** @brief initializes a view at an existing given address */
reim_vector_view(uint64_t m, uint64_t nrows, double* data);
/** @brief view of the given reim */
reim_view row(uint64_t row);
};
#endif // SPQLIOS_REIM4_ELEM_H

View File

@@ -0,0 +1,168 @@
// sha3.c
// 19-Nov-11 Markku-Juhani O. Saarinen <mjos@iki.fi>
// https://github.com/mjosaarinen/tiny_sha3
// LICENSE: MIT
// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3"
// Revised 03-Sep-15 for portability + OpenSSL - style API
#include "sha3.h"
// update the state with given number of rounds
void sha3_keccakf(uint64_t st[25]) {
// constants
const uint64_t keccakf_rndc[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
const int keccakf_rotc[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44};
const int keccakf_piln[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1};
// variables
int i, j, r;
uint64_t t, bc[5];
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
uint8_t* v;
// endianness conversion. this is redundant on little-endian targets
for (i = 0; i < 25; i++) {
v = (uint8_t*)&st[i];
st[i] = ((uint64_t)v[0]) | (((uint64_t)v[1]) << 8) | (((uint64_t)v[2]) << 16) | (((uint64_t)v[3]) << 24) |
(((uint64_t)v[4]) << 32) | (((uint64_t)v[5]) << 40) | (((uint64_t)v[6]) << 48) | (((uint64_t)v[7]) << 56);
}
#endif
// actual iteration
for (r = 0; r < KECCAKF_ROUNDS; r++) {
// Theta
for (i = 0; i < 5; i++) bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
for (i = 0; i < 5; i++) {
t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
for (j = 0; j < 25; j += 5) st[j + i] ^= t;
}
// Rho Pi
t = st[1];
for (i = 0; i < 24; i++) {
j = keccakf_piln[i];
bc[0] = st[j];
st[j] = ROTL64(t, keccakf_rotc[i]);
t = bc[0];
}
// Chi
for (j = 0; j < 25; j += 5) {
for (i = 0; i < 5; i++) bc[i] = st[j + i];
for (i = 0; i < 5; i++) st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
}
// Iota
st[0] ^= keccakf_rndc[r];
}
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
// endianness conversion. this is redundant on little-endian targets
for (i = 0; i < 25; i++) {
v = (uint8_t*)&st[i];
t = st[i];
v[0] = t & 0xFF;
v[1] = (t >> 8) & 0xFF;
v[2] = (t >> 16) & 0xFF;
v[3] = (t >> 24) & 0xFF;
v[4] = (t >> 32) & 0xFF;
v[5] = (t >> 40) & 0xFF;
v[6] = (t >> 48) & 0xFF;
v[7] = (t >> 56) & 0xFF;
}
#endif
}
// Initialize the context for SHA3
int sha3_init(sha3_ctx_t* c, int mdlen) {
int i;
for (i = 0; i < 25; i++) c->st.q[i] = 0;
c->mdlen = mdlen;
c->rsiz = 200 - 2 * mdlen;
c->pt = 0;
return 1;
}
// update state with more data
int sha3_update(sha3_ctx_t* c, const void* data, size_t len) {
size_t i;
int j;
j = c->pt;
for (i = 0; i < len; i++) {
c->st.b[j++] ^= ((const uint8_t*)data)[i];
if (j >= c->rsiz) {
sha3_keccakf(c->st.q);
j = 0;
}
}
c->pt = j;
return 1;
}
// finalize and output a hash
int sha3_final(void* md, sha3_ctx_t* c) {
int i;
c->st.b[c->pt] ^= 0x06;
c->st.b[c->rsiz - 1] ^= 0x80;
sha3_keccakf(c->st.q);
for (i = 0; i < c->mdlen; i++) {
((uint8_t*)md)[i] = c->st.b[i];
}
return 1;
}
// compute a SHA-3 hash (md) of given byte length from "in"
void* sha3(const void* in, size_t inlen, void* md, int mdlen) {
sha3_ctx_t sha3;
sha3_init(&sha3, mdlen);
sha3_update(&sha3, in, inlen);
sha3_final(md, &sha3);
return md;
}
// SHAKE128 and SHAKE256 extensible-output functionality
void shake_xof(sha3_ctx_t* c) {
c->st.b[c->pt] ^= 0x1F;
c->st.b[c->rsiz - 1] ^= 0x80;
sha3_keccakf(c->st.q);
c->pt = 0;
}
void shake_out(sha3_ctx_t* c, void* out, size_t len) {
size_t i;
int j;
j = c->pt;
for (i = 0; i < len; i++) {
if (j >= c->rsiz) {
sha3_keccakf(c->st.q);
j = 0;
}
((uint8_t*)out)[i] = c->st.b[j++];
}
c->pt = j;
}
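/* Usage sketch (illustration only): one-shot hashing with sha3() above, and the SHAKE
 * flow (absorb with shake_update, switch to squeeze mode with shake_xof, then read any
 * number of output bytes with shake_out). */
static void sha3_usage_example(void) {
  uint8_t digest[32];
  sha3("abc", 3, digest, 32); /* SHA3-256 of "abc" */
  sha3_ctx_t c;
  uint8_t stream[64];
  shake128_init(&c);
  shake_update(&c, "seed", 4);
  shake_xof(&c);
  shake_out(&c, stream, sizeof(stream));
}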

View File

@@ -0,0 +1,56 @@
// sha3.h
// 19-Nov-11 Markku-Juhani O. Saarinen <mjos@iki.fi>
// https://github.com/mjosaarinen/tiny_sha3
// License: MIT
#ifndef SHA3_H
#define SHA3_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stddef.h>
#include <stdint.h>
#ifndef KECCAKF_ROUNDS
#define KECCAKF_ROUNDS 24
#endif
#ifndef ROTL64
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
#endif
// state context
typedef struct {
union { // state:
uint8_t b[200]; // 8-bit bytes
uint64_t q[25]; // 64-bit words
} st;
int pt, rsiz, mdlen; // these don't overflow
} sha3_ctx_t;
// Compression function.
void sha3_keccakf(uint64_t st[25]);
// OpenSSL-like interface
int sha3_init(sha3_ctx_t* c, int mdlen); // mdlen = hash output in bytes
int sha3_update(sha3_ctx_t* c, const void* data, size_t len);
int sha3_final(void* md, sha3_ctx_t* c); // digest goes to md
// compute a sha3 hash (md) of given byte length from "in"
void* sha3(const void* in, size_t inlen, void* md, int mdlen);
// SHAKE128 and SHAKE256 extensible-output functions
#define shake128_init(c) sha3_init(c, 16)
#define shake256_init(c) sha3_init(c, 32)
#define shake_update sha3_update
void shake_xof(sha3_ctx_t* c);
void shake_out(sha3_ctx_t* c, void* out, size_t len);
#ifdef __cplusplus
}
#endif
#endif // SHA3_H

View File

@@ -0,0 +1,10 @@
#include "test_commons.h"
#include <inttypes.h>
std::ostream& operator<<(std::ostream& out, __int128_t x) {
char c[35] = {0};
snprintf(c, 35, "0x%016" PRIx64 "%016" PRIx64, uint64_t(x >> 64), uint64_t(x));
return out << c;
}
std::ostream& operator<<(std::ostream& out, __uint128_t x) { return out << __int128_t(x); }

View File

@@ -0,0 +1,74 @@
#ifndef SPQLIOS_TEST_COMMONS_H
#define SPQLIOS_TEST_COMMONS_H
#include <iostream>
#include <random>
#include "../../spqlios/commons.h"
/** @brief macro that crashes if the condition are not met */
#define REQUIRE_DRAMATICALLY(req_condition, error_msg)                                                             \
  do {                                                                                                             \
    if (!(req_condition)) {                                                                                        \
std::cerr << "REQUIREMENT FAILED at " << __FILE__ << ":" << __LINE__ << ": " << error_msg << std::endl; \
abort(); \
} \
} while (0)
typedef std::default_random_engine test_rng;
/** @brief reference to the default test rng */
test_rng& randgen();
/** @brief uniformly random 64-bit uint */
uint64_t uniform_u64();
/** @brief uniformly random number <= 2^nbits-1 */
uint64_t uniform_u64_bits(uint64_t nbits);
/** @brief uniformly random signed 64-bit number */
int64_t uniform_i64();
/** @brief uniformly random signed |number| <= 2^nbits */
int64_t uniform_i64_bits(uint64_t nbits);
/** @brief uniformly random signed lb <= number <= ub */
int64_t uniform_i64_bounds(const int64_t lb, const int64_t ub);
/** @brief uniformly random signed lb <= number <= ub */
__int128_t uniform_i128_bounds(const __int128_t lb, const __int128_t ub);
/** @brief random gaussian float64 with the given stdev (mean 0) */
double random_f64_gaussian(double stdev = 1);
/** @brief uniformly random signed lb <= number <= ub */
double uniform_f64_bounds(const double lb, const double ub);
/** @brief uniformly random float64 in [0,1] */
double uniform_f64_01();
bool is_pow2(uint64_t n);
void* alloc64(uint64_t size);
typedef __uint128_t thash;
/** @brief returns some pseudorandom hash of a contiguous content */
thash test_hash(const void* data, uint64_t size);
/** @brief class to return a pseudorandom hash of a piecewise-defined content */
class test_hasher {
void* md;
public:
test_hasher();
test_hasher(const test_hasher&) = delete;
void operator=(const test_hasher&) = delete;
/**
* @brief append input bytes.
* The final hash only depends on the concatenation of bytes, not on the
* way the content was split into multiple calls to update.
*/
void update(const void* data, uint64_t size);
/**
* @brief returns the final hash.
* no more calls to update(...) shall be issued after this call.
*/
thash hash();
~test_hasher();
};
// not included by default, since it makes some versions of gtest not compile
// std::ostream& operator<<(std::ostream& out, __int128_t x);
// std::ostream& operator<<(std::ostream& out, __uint128_t x);
#endif // SPQLIOS_TEST_COMMONS_H

View File

@@ -0,0 +1,24 @@
#include "sha3.h"
#include "test_commons.h"
/** @brief returns some pseudorandom hash of the content */
thash test_hash(const void* data, uint64_t size) {
thash res;
sha3(data, size, &res, sizeof(res));
return res;
}
/** @brief class to return a pseudorandom hash of the content */
test_hasher::test_hasher() {
md = malloc(sizeof(sha3_ctx_t));
sha3_init((sha3_ctx_t*)md, 16);
}
void test_hasher::update(const void* data, uint64_t size) { sha3_update((sha3_ctx_t*)md, data, size); }
thash test_hasher::hash() {
thash res;
sha3_final(&res, (sha3_ctx_t*)md);
return res;
}
test_hasher::~test_hasher() { free(md); }
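/* Illustration only (not part of the original file): the incremental hasher should match
 * the one-shot test_hash on the same byte sequence, no matter how the input is split
 * across update() calls. */
[[maybe_unused]] static bool test_hasher_split_example() {
  const uint8_t bytes[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  test_hasher h;
  h.update(bytes, 3);
  h.update(bytes + 3, 5);
  return h.hash() == test_hash(bytes, 8);
}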

View File

@@ -0,0 +1,182 @@
#include "vec_rnx_layout.h"
#include <cstring>
#include "../../spqlios/arithmetic/vec_rnx_arithmetic.h"
#ifdef VALGRIND_MEM_TESTS
#include "valgrind/memcheck.h"
#endif
#define CANARY_PADDING (1024)
#define GARBAGE_VALUE (242)
rnx_vec_f64_layout::rnx_vec_f64_layout(uint64_t n, uint64_t size, uint64_t slice) : n(n), size(size), slice(slice) {
REQUIRE_DRAMATICALLY(is_pow2(n), "not a power of 2: " << n);
REQUIRE_DRAMATICALLY(slice >= n, "slice too small: " << slice << " < " << n);
this->region = (uint8_t*)malloc(size * slice * sizeof(int64_t) + 2 * CANARY_PADDING);
this->data_start = (double*)(region + CANARY_PADDING);
// ensure that any invalid value is kind-of garbage
memset(region, GARBAGE_VALUE, size * slice * sizeof(int64_t) + 2 * CANARY_PADDING);
// mark inter-slice memory as not accessible
#ifdef VALGRIND_MEM_TESTS
VALGRIND_MAKE_MEM_NOACCESS(region, CANARY_PADDING);
VALGRIND_MAKE_MEM_NOACCESS(region + size * slice * sizeof(int64_t) + CANARY_PADDING, CANARY_PADDING);
for (uint64_t i = 0; i < size; ++i) {
VALGRIND_MAKE_MEM_UNDEFINED(data_start + i * slice, n * sizeof(int64_t));
}
if (size != slice) {
for (uint64_t i = 0; i < size; ++i) {
VALGRIND_MAKE_MEM_NOACCESS(data_start + i * slice + n, (slice - n) * sizeof(int64_t));
}
}
#endif
}
rnx_vec_f64_layout::~rnx_vec_f64_layout() { free(region); }
rnx_f64 rnx_vec_f64_layout::get_copy_zext(uint64_t index) const {
if (index < size) {
return rnx_f64(n, data_start + index * slice);
} else {
return rnx_f64::zero(n);
}
}
rnx_f64 rnx_vec_f64_layout::get_copy(uint64_t index) const {
REQUIRE_DRAMATICALLY(index < size, "index overflow: " << index << " / " << size);
return rnx_f64(n, data_start + index * slice);
}
reim_fft64vec rnx_vec_f64_layout::get_dft_copy_zext(uint64_t index) const {
if (index < size) {
return reim_fft64vec(n, data_start + index * slice);
} else {
return reim_fft64vec::zero(n);
}
}
reim_fft64vec rnx_vec_f64_layout::get_dft_copy(uint64_t index) const {
REQUIRE_DRAMATICALLY(index < size, "index overflow: " << index << " / " << size);
return reim_fft64vec(n, data_start + index * slice);
}
void rnx_vec_f64_layout::set(uint64_t index, const rnx_f64& elem) {
REQUIRE_DRAMATICALLY(index < size, "index overflow: " << index << " / " << size);
REQUIRE_DRAMATICALLY(elem.nn() == n, "incompatible ring dimensions: " << elem.nn() << " / " << n);
elem.save_as(data_start + index * slice);
}
double* rnx_vec_f64_layout::data() { return data_start; }
const double* rnx_vec_f64_layout::data() const { return data_start; }
void rnx_vec_f64_layout::fill_random(double log2bound) {
for (uint64_t i = 0; i < size; ++i) {
set(i, rnx_f64::random_log2bound(n, log2bound));
}
}
thash rnx_vec_f64_layout::content_hash() const {
test_hasher hasher;
for (uint64_t i = 0; i < size; ++i) {
hasher.update(data() + i * slice, n * sizeof(int64_t));
}
return hasher.hash();
}
fft64_rnx_vmp_pmat_layout::fft64_rnx_vmp_pmat_layout(uint64_t n, uint64_t nrows, uint64_t ncols)
: nn(n),
nrows(nrows),
ncols(ncols), //
data((RNX_VMP_PMAT*)alloc64(nrows * ncols * nn * 8)) {}
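// Descriptive note on the fft64 RNX_VMP_PMAT layout implemented by get_addr below:
// the matrix is split into nn/8 reim4 blocks of 8 doubles, with the block index as the
// major dimension. Inside a block, columns go by pairs stored row-major, the two columns
// of a pair interleaved (offset row*2*8 + (col%2)*8); when ncols is odd, the trailing
// column is stored on its own, row-major.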
double* fft64_rnx_vmp_pmat_layout::get_addr(uint64_t row, uint64_t col, uint64_t blk) const {
REQUIRE_DRAMATICALLY(row < nrows, "row overflow: " << row << " / " << nrows);
REQUIRE_DRAMATICALLY(col < ncols, "col overflow: " << col << " / " << ncols);
REQUIRE_DRAMATICALLY(blk < nn / 8, "block overflow: " << blk << " / " << (nn / 8));
double* d = (double*)data;
if (col == (ncols - 1) && (ncols % 2 == 1)) {
// special case: last column when the total number of columns is odd
return d + blk * nrows * ncols * 8 // major: blk
+ col * nrows * 8 // col == ncols-1
+ row * 8;
} else {
// general case: columns go by pair
return d + blk * nrows * ncols * 8 // major: blk
+ (col / 2) * (2 * nrows) * 8 // second: col pair index
+ row * 2 * 8 // third: row index
+ (col % 2) * 8; // minor: col in colpair
}
}
reim4_elem fft64_rnx_vmp_pmat_layout::get(uint64_t row, uint64_t col, uint64_t blk) const {
return reim4_elem(get_addr(row, col, blk));
}
reim4_elem fft64_rnx_vmp_pmat_layout::get_zext(uint64_t row, uint64_t col, uint64_t blk) const {
REQUIRE_DRAMATICALLY(blk < nn / 8, "block overflow: " << blk << " / " << (nn / 8));
if (row < nrows && col < ncols) {
return reim4_elem(get_addr(row, col, blk));
} else {
return reim4_elem::zero();
}
}
void fft64_rnx_vmp_pmat_layout::set(uint64_t row, uint64_t col, uint64_t blk, const reim4_elem& value) const {
value.save_as(get_addr(row, col, blk));
}
fft64_rnx_vmp_pmat_layout::~fft64_rnx_vmp_pmat_layout() { spqlios_free(data); }
reim_fft64vec fft64_rnx_vmp_pmat_layout::get_zext(uint64_t row, uint64_t col) const {
if (row >= nrows || col >= ncols) {
return reim_fft64vec::zero(nn);
}
if (nn < 8) {
// the pmat is just col major
double* addr = (double*)data + (row + col * nrows) * nn;
return reim_fft64vec(nn, addr);
}
// otherwise, reconstruct it block by block
reim_fft64vec res(nn);
for (uint64_t blk = 0; blk < nn / 8; ++blk) {
reim4_elem v = get(row, col, blk);
res.set_blk(blk, v);
}
return res;
}
void fft64_rnx_vmp_pmat_layout::set(uint64_t row, uint64_t col, const reim_fft64vec& value) {
REQUIRE_DRAMATICALLY(row < nrows, "row overflow: " << row << " / " << nrows);
REQUIRE_DRAMATICALLY(col < ncols, "col overflow: " << col << " / " << ncols);
if (nn < 8) {
// the pmat is just col major
double* addr = (double*)data + (row + col * nrows) * nn;
value.save_as(addr);
return;
}
// otherwise, store it block by block
for (uint64_t blk = 0; blk < nn / 8; ++blk) {
reim4_elem v = value.get_blk(blk);
set(row, col, blk, v);
}
}
void fft64_rnx_vmp_pmat_layout::fill_random(double log2bound) {
for (uint64_t row = 0; row < nrows; ++row) {
for (uint64_t col = 0; col < ncols; ++col) {
set(row, col, reim_fft64vec::random(nn, log2bound));
}
}
}
fft64_rnx_svp_ppol_layout::fft64_rnx_svp_ppol_layout(uint64_t n)
: nn(n), //
data((RNX_SVP_PPOL*)alloc64(nn * 8)) {}
reim_fft64vec fft64_rnx_svp_ppol_layout::get_copy() const { return reim_fft64vec(nn, (double*)data); }
void fft64_rnx_svp_ppol_layout::set(const reim_fft64vec& value) { value.save_as((double*)data); }
void fft64_rnx_svp_ppol_layout::fill_dft_random(uint64_t log2bound) { set(reim_fft64vec::dft_random(nn, log2bound)); }
void fft64_rnx_svp_ppol_layout::fill_random(double log2bound) { set(reim_fft64vec::random(nn, log2bound)); }
fft64_rnx_svp_ppol_layout::~fft64_rnx_svp_ppol_layout() { spqlios_free(data); }
thash fft64_rnx_svp_ppol_layout::content_hash() const { return test_hash(data, nn * sizeof(double)); }

View File

@@ -0,0 +1,85 @@
#ifndef SPQLIOS_EXT_VEC_RNX_LAYOUT_H
#define SPQLIOS_EXT_VEC_RNX_LAYOUT_H
#include "../../spqlios/arithmetic/vec_rnx_arithmetic.h"
#include "fft64_dft.h"
#include "negacyclic_polynomial.h"
#include "reim4_elem.h"
#include "test_commons.h"
/** @brief a test memory layout for rnx i64 polynomials vectors */
class rnx_vec_f64_layout {
uint64_t n;
uint64_t size;
uint64_t slice;
double* data_start;
uint8_t* region;
public:
// NO-COPY structure
rnx_vec_f64_layout(const rnx_vec_f64_layout&) = delete;
void operator=(const rnx_vec_f64_layout&) = delete;
rnx_vec_f64_layout(rnx_vec_f64_layout&&) = delete;
void operator=(rnx_vec_f64_layout&&) = delete;
/** @brief initialises a memory layout */
rnx_vec_f64_layout(uint64_t n, uint64_t size, uint64_t slice);
/** @brief destructor */
~rnx_vec_f64_layout();
/** @brief get a copy of the item at position index (zero if index >= size) */
rnx_f64 get_copy_zext(uint64_t index) const;
/** @brief get a copy of the item at position index (requires index < size) */
rnx_f64 get_copy(uint64_t index) const;
/** @brief get a copy of the item at position index, viewed as a fft64 vector (zero if index >= size) */
reim_fft64vec get_dft_copy_zext(uint64_t index) const;
/** @brief get a copy of the item at position index, viewed as a fft64 vector (requires index < size) */
reim_fft64vec get_dft_copy(uint64_t index) const;
/** @brief sets the item at position index (requires index < size) */
void set(uint64_t index, const rnx_f64& elem);
/** @brief fill with random values */
void fill_random(double log2bound = 0);
/** @brief raw pointer access */
double* data();
/** @brief raw pointer access (const version) */
const double* data() const;
/** @brief content hashcode */
thash content_hash() const;
};
/** @brief test layout for the VMP_PMAT */
class fft64_rnx_vmp_pmat_layout {
public:
const uint64_t nn;
const uint64_t nrows;
const uint64_t ncols;
RNX_VMP_PMAT* const data;
fft64_rnx_vmp_pmat_layout(uint64_t n, uint64_t nrows, uint64_t ncols);
double* get_addr(uint64_t row, uint64_t col, uint64_t blk) const;
reim4_elem get(uint64_t row, uint64_t col, uint64_t blk) const;
thash content_hash() const;
reim4_elem get_zext(uint64_t row, uint64_t col, uint64_t blk) const;
reim_fft64vec get_zext(uint64_t row, uint64_t col) const;
void set(uint64_t row, uint64_t col, uint64_t blk, const reim4_elem& v) const;
void set(uint64_t row, uint64_t col, const reim_fft64vec& value);
/** @brief fill with random double values (unstructured) */
void fill_random(double log2bound);
~fft64_rnx_vmp_pmat_layout();
};
/** @brief test layout for the SVP_PPOL */
class fft64_rnx_svp_ppol_layout {
public:
const uint64_t nn;
RNX_SVP_PPOL* const data;
fft64_rnx_svp_ppol_layout(uint64_t n);
thash content_hash() const;
reim_fft64vec get_copy() const;
void set(const reim_fft64vec&);
/** @brief fill with random double values (unstructured) */
void fill_random(double log2bound);
/** @brief fill with random ffts of small int polynomials */
void fill_dft_random(uint64_t log2bound);
~fft64_rnx_svp_ppol_layout();
};
#endif // SPQLIOS_EXT_VEC_RNX_LAYOUT_H

View File

@@ -0,0 +1,55 @@
#include "zn_layouts.h"
zn32_pmat_layout::zn32_pmat_layout(uint64_t nrows, uint64_t ncols)
: nrows(nrows), //
ncols(ncols), //
data((ZN32_VMP_PMAT*)malloc((nrows * ncols + 7) * sizeof(int32_t))) {}
zn32_pmat_layout::~zn32_pmat_layout() { free(data); }
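// Descriptive note on the ZN32_VMP_PMAT layout implemented by get_addr below: columns are
// grouped into blocks of 32; each full block is stored row-major with a stride of 32
// int32 per row, and the trailing partial block (ncols % 32 columns) is stored row-major
// with a stride of ncols % 32.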
int32_t* zn32_pmat_layout::get_addr(uint64_t row, uint64_t col) const {
REQUIRE_DRAMATICALLY(row < nrows, "row overflow: " << row << " / " << nrows);
REQUIRE_DRAMATICALLY(col < ncols, "col overflow: " << col << " / " << ncols);
const uint64_t nblk = ncols >> 5;
const uint64_t rem_ncols = ncols & 31;
uint64_t blk = col >> 5;
uint64_t col_rem = col & 31;
if (blk < nblk) {
// column is part of a full block
return (int32_t*)data + blk * nrows * 32 + row * 32 + col_rem;
} else {
// column is part of the last block
return (int32_t*)data + blk * nrows * 32 + row * rem_ncols + col_rem;
}
}
int32_t zn32_pmat_layout::get(uint64_t row, uint64_t col) const { return *get_addr(row, col); }
int32_t zn32_pmat_layout::get_zext(uint64_t row, uint64_t col) const {
if (row >= nrows || col >= ncols) return 0;
return *get_addr(row, col);
}
void zn32_pmat_layout::set(uint64_t row, uint64_t col, int32_t value) { *get_addr(row, col) = value; }
void zn32_pmat_layout::fill_random() {
int32_t* d = (int32_t*)data;
for (uint64_t i = 0; i < nrows * ncols; ++i) d[i] = uniform_i64_bits(32);
}
thash zn32_pmat_layout::content_hash() const { return test_hash(data, nrows * ncols * sizeof(int32_t)); }
template <typename T>
std::vector<int32_t> vmp_product(const T* vec, uint64_t vec_size, uint64_t out_size, const zn32_pmat_layout& mat) {
uint64_t rows = std::min(vec_size, mat.nrows);
uint64_t cols = std::min(out_size, mat.ncols);
std::vector<int32_t> res(out_size, 0);
for (uint64_t j = 0; j < cols; ++j) {
for (uint64_t i = 0; i < rows; ++i) {
res[j] += vec[i] * mat.get(i, j);
}
}
return res;
}
template std::vector<int32_t> vmp_product(const int8_t* vec, uint64_t vec_size, uint64_t out_size,
const zn32_pmat_layout& mat);
template std::vector<int32_t> vmp_product(const int16_t* vec, uint64_t vec_size, uint64_t out_size,
const zn32_pmat_layout& mat);
template std::vector<int32_t> vmp_product(const int32_t* vec, uint64_t vec_size, uint64_t out_size,
const zn32_pmat_layout& mat);
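/* Usage sketch (illustration only): the reference product above computes
 * res[j] = sum_i vec[i] * mat.get(i, j) over the overlapping dimensions; tests typically
 * compare it against the optimized zn32 vmp kernels. */
[[maybe_unused]] static std::vector<int32_t> vmp_product_usage_example() {
  zn32_pmat_layout mat(4, 8);  // 4 rows, 8 columns
  mat.fill_random();
  const int32_t vec[4] = {1, -2, 3, -4};
  return vmp_product(vec, 4, 8, mat);
}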

View File

@@ -0,0 +1,29 @@
#ifndef SPQLIOS_EXT_ZN_LAYOUTS_H
#define SPQLIOS_EXT_ZN_LAYOUTS_H
#include "../../spqlios/arithmetic/zn_arithmetic.h"
#include "test_commons.h"
class zn32_pmat_layout {
public:
const uint64_t nrows;
const uint64_t ncols;
ZN32_VMP_PMAT* const data;
zn32_pmat_layout(uint64_t nrows, uint64_t ncols);
private:
int32_t* get_addr(uint64_t row, uint64_t col) const;
public:
int32_t get(uint64_t row, uint64_t col) const;
int32_t get_zext(uint64_t row, uint64_t col) const;
void set(uint64_t row, uint64_t col, int32_t value);
void fill_random();
thash content_hash() const;
~zn32_pmat_layout();
};
template <typename T>
std::vector<int32_t> vmp_product(const T* vec, uint64_t vec_size, uint64_t out_size, const zn32_pmat_layout& mat);
#endif // SPQLIOS_EXT_ZN_LAYOUTS_H