From 881483d1bbc148581d23edf7b8440d01159b4357 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Bossuat Date: Sun, 26 Oct 2025 16:32:22 +0100 Subject: [PATCH] wip --- .../src/cpu_fft64_avx/vec_znx_dft.rs | 4 +- poulpy-backend/src/cpu_fft64_ref/tests.rs | 2 +- .../src/cpu_fft64_ref/vec_znx_dft.rs | 4 +- .../src/cpu_spqlios/fft64/vec_znx_dft.rs | 6 +- poulpy-hal/src/api/convolution.rs | 161 +++++++++++------- poulpy-hal/src/api/vec_znx_dft.rs | 2 +- poulpy-hal/src/delegates/vec_znx_dft.rs | 4 +- poulpy-hal/src/oep/vec_znx_dft.rs | 2 +- poulpy-hal/src/reference/fft64/vec_znx_dft.rs | 9 +- poulpy-hal/src/test_suite/convolution.rs | 102 ++++++----- .../algorithms/cggi/algorithm.rs | 17 +- 11 files changed, 173 insertions(+), 140 deletions(-) diff --git a/poulpy-backend/src/cpu_fft64_avx/vec_znx_dft.rs b/poulpy-backend/src/cpu_fft64_avx/vec_znx_dft.rs index 57ffc6f..1e1954e 100644 --- a/poulpy-backend/src/cpu_fft64_avx/vec_znx_dft.rs +++ b/poulpy-backend/src/cpu_fft64_avx/vec_znx_dft.rs @@ -194,10 +194,10 @@ unsafe impl VecZnxDftCopyImpl for FFT64Avx { } unsafe impl VecZnxDftZeroImpl for FFT64Avx { - fn vec_znx_dft_zero_impl(_module: &Module, res: &mut R) + fn vec_znx_dft_zero_impl(_module: &Module, res: &mut R, res_col: usize) where R: VecZnxDftToMut, { - vec_znx_dft_zero(res); + vec_znx_dft_zero(res, res_col); } } diff --git a/poulpy-backend/src/cpu_fft64_ref/tests.rs b/poulpy-backend/src/cpu_fft64_ref/tests.rs index 3f824c3..4531117 100644 --- a/poulpy-backend/src/cpu_fft64_ref/tests.rs +++ b/poulpy-backend/src/cpu_fft64_ref/tests.rs @@ -4,6 +4,6 @@ use crate::FFT64Ref; #[test] fn test_convolution_fft64_ref() { - let module: Module = Module::::new(64); + let module: Module = Module::::new(8); test_convolution(&module); } diff --git a/poulpy-backend/src/cpu_fft64_ref/vec_znx_dft.rs b/poulpy-backend/src/cpu_fft64_ref/vec_znx_dft.rs index 5ad6400..b6ee4dd 100644 --- a/poulpy-backend/src/cpu_fft64_ref/vec_znx_dft.rs +++ b/poulpy-backend/src/cpu_fft64_ref/vec_znx_dft.rs @@ 
-194,10 +194,10 @@ unsafe impl VecZnxDftCopyImpl for FFT64Ref { } unsafe impl VecZnxDftZeroImpl for FFT64Ref { - fn vec_znx_dft_zero_impl(_module: &Module, res: &mut R) + fn vec_znx_dft_zero_impl(_module: &Module, res: &mut R, res_col: usize) where R: VecZnxDftToMut, { - vec_znx_dft_zero(res); + vec_znx_dft_zero(res, res_col); } } diff --git a/poulpy-backend/src/cpu_spqlios/fft64/vec_znx_dft.rs b/poulpy-backend/src/cpu_spqlios/fft64/vec_znx_dft.rs index 3b67089..cdffb41 100644 --- a/poulpy-backend/src/cpu_spqlios/fft64/vec_znx_dft.rs +++ b/poulpy-backend/src/cpu_spqlios/fft64/vec_znx_dft.rs @@ -12,7 +12,7 @@ use poulpy_hal::{ reference::{ fft64::{ reim::{ReimCopy, ReimZero, reim_copy_ref, reim_negate_inplace_ref, reim_negate_ref, reim_zero_ref}, - vec_znx_dft::vec_znx_dft_copy, + vec_znx_dft::{vec_znx_dft_copy, vec_znx_dft_zero}, }, znx::znx_zero_ref, }, @@ -426,10 +426,10 @@ impl ReimZero for FFT64Spqlios { } unsafe impl VecZnxDftZeroImpl for FFT64Spqlios { - fn vec_znx_dft_zero_impl(_module: &Module, res: &mut R) + fn vec_znx_dft_zero_impl(_module: &Module, res: &mut R, res_col: usize) where R: VecZnxDftToMut, { - res.to_mut().data.fill(0); + vec_znx_dft_zero(res, res_col); } } diff --git a/poulpy-hal/src/api/convolution.rs b/poulpy-hal/src/api/convolution.rs index 2f32de2..9b34ead 100644 --- a/poulpy-hal/src/api/convolution.rs +++ b/poulpy-hal/src/api/convolution.rs @@ -1,9 +1,9 @@ use crate::{ api::{ ModuleN, ScratchTakeBasic, SvpApplyDftToDft, SvpPPolAlloc, SvpPPolBytesOf, SvpPrepare, VecZnxDftAddScaledInplace, - VecZnxDftBytesOf, + VecZnxDftBytesOf, VecZnxDftZero, }, - layouts::{Backend, Module, Scratch, VecZnxDftToMut, VecZnxDftToRef, VecZnxToRef, ZnxInfos, ZnxZero}, + layouts::{Backend, Module, Scratch, VecZnxDftToMut, VecZnxDftToRef, VecZnxToRef, ZnxInfos}, }; impl Convolution for Module @@ -15,7 +15,8 @@ where + SvpPrepare + SvpPPolBytesOf + VecZnxDftBytesOf - + VecZnxDftAddScaledInplace, + + VecZnxDftAddScaledInplace + + VecZnxDftZero, Scratch: 
ScratchTakeBasic, { } @@ -29,46 +30,15 @@ where + SvpPrepare + SvpPPolBytesOf + VecZnxDftBytesOf - + VecZnxDftAddScaledInplace, + + VecZnxDftAddScaledInplace + + VecZnxDftZero, Scratch: ScratchTakeBasic, { - fn convolution_tmp_bytes(&self, res_size: usize) -> usize { - self.bytes_of_svp_ppol(1) + self.bytes_of_vec_znx_dft(1, res_size) + fn convolution_tmp_bytes(&self, b_size: usize) -> usize { + self.bytes_of_svp_ppol(1) + self.bytes_of_vec_znx_dft(1, b_size) } - /// Evaluates a bivariate convolution over Z[X, Y] / (X^N + 1) where Y = 2^-K - /// and scales the result by 2^{res_scale * K} - /// - /// # Example - /// a = [a00, a10, a20, a30] = (a00 * 2^-K + a01 * 2^-2K) + (a10 * 2^-K + a11 * 2^-2K) * X ... - /// [a01, a11, a21, a31] - /// - /// b = [b00, b10, b20, b30] = (b00 * 2^-K + b01 * 2^-2K) + (b10 * 2^-K + b11 * 2^-2K) * X ... - /// [b01, b11, b21, b31] - /// - /// If res_scale = 0: - /// res = [ 0, 0, 0, 0] = (r01 * 2^-2K + r02 * 2^-3K + r03 * 2^-4K + r04 * 2^-5K) + ... - /// [r01, r11, r21, r31] - /// [r02, r12, r22, r32] - /// [r03, r13, r23, r33] - /// [r04, r14, r24, r34] - /// - /// If res_scale = 1: - /// res = [r01, r11, r21, r31] = (r01 * 2^-K + r02 * 2^-2K + r03 * 2^-3K + r04 * 2^-4K + r05 * 2^-5K) + ... - /// [r02, r12, r22, r32] - /// [r03, r13, r23, r33] - /// [r04, r14, r24, r34] - /// [r05, r15, r25, r35] - /// - /// If res_scale = -1: - /// res = [ 0, 0, 0, 0] = (r01 * 2^-3K + r02 * 2^-4K + r03 * 2^-5K) + ... - /// [ 0, 0, 0, 0] - /// [r01, r11, r21, r31] - /// [r02, r12, r22, r32] - /// [r03, r13, r23, r33] - /// - /// If res.size() < a.size() + b.size() + 1 + res_scale, result is truncated accordingly in the Y dimension. 
- fn convolution(&self, res: &mut R, res_scale: i64, a: &A, b: &B, scratch: &mut Scratch) + fn bivariate_convolution_full(&self, k: i64, res: &mut R, a: &A, b: &B, scratch: &mut Scratch) where R: VecZnxDftToMut, A: VecZnxToRef, @@ -78,32 +48,99 @@ where let a: &crate::layouts::VecZnx<&[u8]> = &a.to_ref(); let b: &crate::layouts::VecZnxDft<&[u8], BE> = &b.to_ref(); - assert!(res.cols() >= a.cols() + b.cols() - 1); + let res_cols: usize = res.cols(); + let a_cols: usize = a.cols(); + let b_cols: usize = b.cols(); - res.zero(); + assert!(res_cols >= a_cols + b_cols - 1); - let (mut ppol, scratch_1) = scratch.take_svp_ppol(self, 1); - let (mut res_tmp, _) = scratch_1.take_vec_znx_dft(self, 1, res.size()); - - for a_col in 0..a.cols() { - for a_limb in 0..a.size() { - // Prepares the j-th limb of the i-th col of A - self.svp_prepare(&mut ppol, 0, &a.as_scalar_znx_ref(a_col, a_limb), 0); - - for b_col in 0..b.cols() { - // Multiplies with the i-th col of B - self.svp_apply_dft_to_dft(&mut res_tmp, 0, &ppol, 0, b, b_col); - - // Adds on the [a_col + b_col] of res, scaled by 2^{-(a_limb + 1) * Base2K} - self.vec_znx_dft_add_scaled_inplace( - res, - a_col + b_col, - &res_tmp, - 0, - -(1 + a_limb as i64) + res_scale, - ); - } + for res_col in 0..res_cols { + let a_min: usize = res_col.saturating_sub(b_cols - 1); + let a_max: usize = res_col.min(a_cols - 1); + self.bivariate_convolution_single(k, res, res_col, a, a_min, b, res_col - a_min, scratch); + for a_col in a_min + 1..a_max + 1 { + self.bivariate_convolution_single_add(k, res, res_col, a, a_col, b, res_col - a_col, scratch); } } } + + /// Evaluates a bivariate convolution over Z[X, Y] / (X^N + 1) where Y = 2^-K over the + /// selected columns and stores the result on the selected column, scaled by 2^{k * Base2K} + /// + /// # Example + /// a = [a00, a10, a20, a30] = (a00 * 2^-K + a01 * 2^-2K) + (a10 * 2^-K + a11 * 2^-2K) * X ... 
+ /// [a01, a11, a21, a31] + /// + /// b = [b00, b10, b20, b30] = (b00 * 2^-K + b01 * 2^-2K) + (b10 * 2^-K + b11 * 2^-2K) * X ... + /// [b01, b11, b21, b31] + /// + /// If k = 0: + /// res = [ 0, 0, 0, 0] = (r01 * 2^-2K + r02 * 2^-3K + r03 * 2^-4K + r04 * 2^-5K) + ... + /// [r01, r11, r21, r31] + /// [r02, r12, r22, r32] + /// [r03, r13, r23, r33] + /// [r04, r14, r24, r34] + /// + /// If k = 1: + /// res = [r01, r11, r21, r31] = (r01 * 2^-K + r02 * 2^-2K + r03 * 2^-3K + r04 * 2^-4K + r05 * 2^-5K) + ... + /// [r02, r12, r22, r32] + /// [r03, r13, r23, r33] + /// [r04, r14, r24, r34] + /// [r05, r15, r25, r35] + /// + /// If k = -1: + /// res = [ 0, 0, 0, 0] = (r01 * 2^-3K + r02 * 2^-4K + r03 * 2^-5K) + ... + /// [ 0, 0, 0, 0] + /// [r01, r11, r21, r31] + /// [r02, r12, r22, r32] + /// [r03, r13, r23, r33] + /// + /// If res.size() < a.size() + b.size() + 1 + k, result is truncated accordingly in the Y dimension. + fn bivariate_convolution_single_add( + &self, + k: i64, + res: &mut R, + res_col: usize, + a: &A, + a_col: usize, + b: &B, + b_col: usize, + scratch: &mut Scratch, + ) where + R: VecZnxDftToMut, + A: VecZnxToRef, + B: VecZnxDftToRef, + { + let res: &mut crate::layouts::VecZnxDft<&mut [u8], BE> = &mut res.to_mut(); + let a: &crate::layouts::VecZnx<&[u8]> = &a.to_ref(); + let b: &crate::layouts::VecZnxDft<&[u8], BE> = &b.to_ref(); + + let (mut ppol, scratch_1) = scratch.take_svp_ppol(self, 1); + let (mut res_tmp, _) = scratch_1.take_vec_znx_dft(self, 1, b.size()); + + for a_limb in 0..a.size() { + self.svp_prepare(&mut ppol, 0, &a.as_scalar_znx_ref(a_col, a_limb), 0); + self.svp_apply_dft_to_dft(&mut res_tmp, 0, &ppol, 0, b, b_col); + self.vec_znx_dft_add_scaled_inplace(res, res_col, &res_tmp, 0, -(1 + a_limb as i64) + k); + } + } + + fn bivariate_convolution_single( + &self, + k: i64, + res: &mut R, + res_col: usize, + a: &A, + a_col: usize, + b: &B, + b_col: usize, + scratch: &mut Scratch, + ) where + R: VecZnxDftToMut, + A: VecZnxToRef, + B: 
VecZnxDftToRef, + { + self.vec_znx_dft_zero(res, res_col); + self.bivariate_convolution_single_add(k, res, res_col, a, a_col, b, b_col, scratch); + } } diff --git a/poulpy-hal/src/api/vec_znx_dft.rs b/poulpy-hal/src/api/vec_znx_dft.rs index 61396c4..0044c18 100644 --- a/poulpy-hal/src/api/vec_znx_dft.rs +++ b/poulpy-hal/src/api/vec_znx_dft.rs @@ -97,7 +97,7 @@ pub trait VecZnxDftCopy { } pub trait VecZnxDftZero { - fn vec_znx_dft_zero(&self, res: &mut R) + fn vec_znx_dft_zero(&self, res: &mut R, res_col: usize) where R: VecZnxDftToMut; } diff --git a/poulpy-hal/src/delegates/vec_znx_dft.rs b/poulpy-hal/src/delegates/vec_znx_dft.rs index 7dfb25f..3e9cd03 100644 --- a/poulpy-hal/src/delegates/vec_znx_dft.rs +++ b/poulpy-hal/src/delegates/vec_znx_dft.rs @@ -200,10 +200,10 @@ impl VecZnxDftZero for Module where B: Backend + VecZnxDftZeroImpl, { - fn vec_znx_dft_zero(&self, res: &mut R) + fn vec_znx_dft_zero(&self, res: &mut R, res_col: usize) where R: VecZnxDftToMut, { - B::vec_znx_dft_zero_impl(self, res); + B::vec_znx_dft_zero_impl(self, res, res_col); } } diff --git a/poulpy-hal/src/oep/vec_znx_dft.rs b/poulpy-hal/src/oep/vec_znx_dft.rs index f561084..abdb92a 100644 --- a/poulpy-hal/src/oep/vec_znx_dft.rs +++ b/poulpy-hal/src/oep/vec_znx_dft.rs @@ -188,7 +188,7 @@ pub unsafe trait VecZnxDftCopyImpl { /// * See [crate::api::VecZnxDftZero] for corresponding public API. /// # Safety [crate::doc::backend_safety] for safety contract. 
pub unsafe trait VecZnxDftZeroImpl { - fn vec_znx_dft_zero_impl(module: &Module, res: &mut R) + fn vec_znx_dft_zero_impl(module: &Module, res: &mut R, res_col: usize) where R: VecZnxDftToMut; } diff --git a/poulpy-hal/src/reference/fft64/vec_znx_dft.rs b/poulpy-hal/src/reference/fft64/vec_znx_dft.rs index e8d12e6..fa8d9e1 100644 --- a/poulpy-hal/src/reference/fft64/vec_znx_dft.rs +++ b/poulpy-hal/src/reference/fft64/vec_znx_dft.rs @@ -118,7 +118,7 @@ where } } else if a_scale < 0 { let shift: usize = (a_scale.unsigned_abs() as usize).min(res_size); - let sum_size: usize = a_size.min(res_size).saturating_sub(shift); + let sum_size: usize = a_size.min(res_size.saturating_sub(shift)); for j in 0..sum_size { BE::reim_add_inplace(res.at_mut(res_col, j + shift), a.at(a_col, j)); } @@ -398,10 +398,13 @@ where } } -pub fn vec_znx_dft_zero(res: &mut R) +pub fn vec_znx_dft_zero(res: &mut R, res_col: usize) where R: VecZnxDftToMut, BE: Backend + ReimZero, { - BE::reim_zero(res.to_mut().raw_mut()); + let res: &mut VecZnxDft<&mut [u8], BE> = &mut res.to_mut(); + for j in 0..res.size() { + BE::reim_zero(res.at_mut(res_col, j)) + } } diff --git a/poulpy-hal/src/test_suite/convolution.rs b/poulpy-hal/src/test_suite/convolution.rs index 8f4c71c..05f2df9 100644 --- a/poulpy-hal/src/test_suite/convolution.rs +++ b/poulpy-hal/src/test_suite/convolution.rs @@ -1,7 +1,7 @@ use crate::{ api::{ - Convolution, ModuleN, ScratchOwnedAlloc, ScratchOwnedBorrow, ScratchTakeBasic, TakeSlice, VecZnxBigNormalize, - VecZnxDftAlloc, VecZnxDftApply, VecZnxIdftApplyConsume, VecZnxNormalizeInplace, + Convolution, ModuleN, ScratchOwnedAlloc, ScratchOwnedBorrow, ScratchTakeBasic, TakeSlice, VecZnxBigAlloc, + VecZnxBigNormalize, VecZnxDftAlloc, VecZnxDftApply, VecZnxIdftApplyTmpA, VecZnxNormalizeInplace, }, layouts::{ Backend, FillUniform, Scratch, ScratchOwned, VecZnx, VecZnxBig, VecZnxDft, VecZnxToMut, VecZnxToRef, ZnxInfos, ZnxView, @@ -16,9 +16,10 @@ where + Convolution + VecZnxDftAlloc + 
VecZnxDftApply - + VecZnxIdftApplyConsume + + VecZnxIdftApplyTmpA + VecZnxBigNormalize - + VecZnxNormalizeInplace, + + VecZnxNormalizeInplace + + VecZnxBigAlloc, Scratch: ScratchTakeBasic, ScratchOwned: ScratchOwnedAlloc + ScratchOwnedBorrow, { @@ -26,70 +27,63 @@ where let base2k: usize = 12; - for a_cols in 1..3 { - for b_cols in 1..3 { - for a_size in 1..5 { - for b_size in 1..5 { - let mut a: VecZnx> = VecZnx::alloc(module.n(), a_cols, a_size); - let mut b: VecZnx> = VecZnx::alloc(module.n(), b_cols, b_size); + let a_cols: usize = 3; + let b_cols: usize = 3; + let a_size: usize = 3; + let b_size: usize = 3; + let c_cols: usize = a_cols + b_cols - 1; + let c_size: usize = a_size + b_size; - let mut c_want: VecZnx> = VecZnx::alloc(module.n(), a_cols + b_cols - 1, b_size + a_size); - let mut c_have: VecZnx> = VecZnx::alloc(module.n(), c_want.cols(), c_want.size()); + let mut a: VecZnx> = VecZnx::alloc(module.n(), a_cols, a_size); + let mut b: VecZnx> = VecZnx::alloc(module.n(), b_cols, b_size); - let mut scratch: ScratchOwned = ScratchOwned::alloc(module.convolution_tmp_bytes(c_want.size())); + let mut c_want: VecZnx> = VecZnx::alloc(module.n(), c_cols, c_size); + let mut c_have: VecZnx> = VecZnx::alloc(module.n(), c_cols, c_size); + let mut c_have_dft: VecZnxDft, BE> = module.vec_znx_dft_alloc(c_cols, c_size); + let mut c_have_big: VecZnxBig, BE> = module.vec_znx_big_alloc(c_cols, c_size); - a.fill_uniform(base2k, &mut source); - b.fill_uniform(base2k, &mut source); + let mut scratch: ScratchOwned = ScratchOwned::alloc(module.convolution_tmp_bytes(b_size)); - let mut b_dft: VecZnxDft, BE> = module.vec_znx_dft_alloc(b.cols(), b.size()); + a.fill_uniform(base2k, &mut source); + b.fill_uniform(base2k, &mut source); - for i in 0..b.cols() { - module.vec_znx_dft_apply(1, 0, &mut b_dft, i, &b, i); - } + let mut b_dft: VecZnxDft, BE> = module.vec_znx_dft_alloc(b_cols, b_size); + for i in 0..b.cols() { + module.vec_znx_dft_apply(1, 0, &mut b_dft, i, &b, i); + } - for mut 
res_scale in 0..2 * c_want.size() as i64 + 1 { - res_scale -= c_want.size() as i64; + for mut k in 0..(2 * c_size + 1) as i64 { + k -= c_size as i64; - let mut c_have_dft: VecZnxDft, BE> = module.vec_znx_dft_alloc(c_have.cols(), c_have.size()); - module.convolution(&mut c_have_dft, res_scale, &a, &b_dft, scratch.borrow()); + module.bivariate_convolution_full(k, &mut c_have_dft, &a, &b_dft, scratch.borrow()); - let c_have_big: VecZnxBig, BE> = module.vec_znx_idft_apply_consume(c_have_dft); - - for i in 0..c_have.cols() { - module.vec_znx_big_normalize( - base2k, - &mut c_have, - i, - base2k, - &c_have_big, - i, - scratch.borrow(), - ); - } - - convolution_naive( - module, - base2k, - &mut c_want, - res_scale, - &a, - &b, - scratch.borrow(), - ); - - assert_eq!(c_want, c_have); - } - } - } + for i in 0..c_cols { + module.vec_znx_idft_apply_tmpa(&mut c_have_big, i, &mut c_have_dft, i); } + + for i in 0..c_cols { + module.vec_znx_big_normalize( + base2k, + &mut c_have, + i, + base2k, + &c_have_big, + i, + scratch.borrow(), + ); + } + + convolution_naive(module, base2k, k, &mut c_want, &a, &b, scratch.borrow()); + + assert_eq!(c_want, c_have); } } fn convolution_naive( module: &M, base2k: usize, + k: i64, res: &mut R, - res_scale: i64, a: &A, b: &B, scratch: &mut Scratch, @@ -112,11 +106,11 @@ fn convolution_naive( for a_limb in 0..a.size() { for b_col in 0..b.cols() { for b_limb in 0..b.size() { - let res_scale_abs = res_scale.unsigned_abs() as usize; + let res_scale_abs = k.unsigned_abs() as usize; let mut res_limb: usize = a_limb + b_limb + 1; - if res_scale <= 0 { + if k <= 0 { res_limb += res_scale_abs; if res_limb < res.size() { diff --git a/poulpy-schemes/src/tfhe/blind_rotation/algorithms/cggi/algorithm.rs b/poulpy-schemes/src/tfhe/blind_rotation/algorithms/cggi/algorithm.rs index b9ec277..c65db52 100644 --- a/poulpy-schemes/src/tfhe/blind_rotation/algorithms/cggi/algorithm.rs +++ b/poulpy-schemes/src/tfhe/blind_rotation/algorithms/cggi/algorithm.rs @@ -189,12 
+189,12 @@ fn execute_block_binary_extended( brk.data.chunks_exact(block_size) ) .for_each(|(ai, ski)| { - (0..extension_factor).for_each(|i| { - (0..cols).for_each(|j| { + for i in 0..extension_factor { + for j in 0..cols { module.vec_znx_dft_apply(1, 0, &mut acc_dft[i], j, &acc[i], j); - }); - module.vec_znx_dft_zero(&mut acc_add_dft[i]) - }); + module.vec_znx_dft_zero(&mut acc_add_dft[i], j) + } + } // TODO: first & last iterations can be optimized izip!(ai.iter(), ski.iter()).for_each(|(aii, skii)| { @@ -342,11 +342,10 @@ fn execute_block_binary( brk.data.chunks_exact(block_size) ) .for_each(|(ai, ski)| { - (0..cols).for_each(|j| { + for j in 0..cols { module.vec_znx_dft_apply(1, 0, &mut acc_dft, j, out_mut.data_mut(), j); - }); - - module.vec_znx_dft_zero(&mut acc_add_dft); + module.vec_znx_dft_zero(&mut acc_add_dft, j) + } izip!(ai.iter(), ski.iter()).for_each(|(aii, skii)| { let ai_pos: usize = ((aii + two_n as i64) & (two_n - 1) as i64) as usize;