refactoring for specific implementations

2026-02-10 13:16:44 +01:00 · 2024-12-20 13:22:35 +01:00
parent a24ad55adc
commit 5dd371f6b0
23 changed files with 1671 additions and 527 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,24 +2,176 @@
 # It is not intended for manual editing.
 version = 4

+[[package]]
+name = "aho-corasick"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
+[[package]]
+name = "anstyle"
+version = "1.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
+
 [[package]]
 name = "autocfg"
 version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"

+[[package]]
+name = "bumpalo"
+version = "3.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
+
 [[package]]
 name = "byteorder"
 version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"

+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
+[[package]]
+name = "clap"
+version = "4.5.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
+dependencies = [
+ "clap_builder",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
+dependencies = [
+ "anstyle",
+ "clap_lex",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
+
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "is-terminal",
+ "itertools 0.10.5",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools 0.10.5",
+]
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crunchy"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
+
 [[package]]
 name = "either"
 version = "1.13.0"
@@ -37,6 +189,33 @@ dependencies = [
 "wasi",
 ]

+[[package]]
+name = "half"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
+
+[[package]]
+name = "is-terminal"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "itertools"
 version = "0.10.5"
@@ -46,22 +225,61 @@ dependencies = [
 "either",
 ]

+[[package]]
+name = "itertools"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
+
+[[package]]
+name = "js-sys"
+version = "0.3.76"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
+dependencies = [
+ "once_cell",
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.167"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc"

+[[package]]
+name = "log"
+version = "0.4.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
+
 [[package]]
 name = "math"
 version = "0.1.0"
 dependencies = [
+ "criterion",
+ "itertools 0.13.0",
 "num-bigint",
 "num-traits",
 "primality-test",
 "prime_factorization",
 ]

+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
 [[package]]
 name = "num"
 version = "0.4.3"
@@ -135,6 +353,46 @@ dependencies = [
 "autocfg",
 ]

+[[package]]
+name = "once_cell"
+version = "1.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
+
+[[package]]
+name = "oorandom"
+version = "11.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9"
+
+[[package]]
+name = "plotters"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
+dependencies = [
+ "num-traits",
+ "plotters-backend",
+ "plotters-svg",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "plotters-backend"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
+
+[[package]]
+name = "plotters-svg"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
+dependencies = [
+ "plotters-backend",
+]
+
 [[package]]
 name = "ppv-lite86"
 version = "0.2.20"
@@ -152,11 +410,11 @@ checksum = "98439e9658b9548a33abdab8c82532554dc08e49ddc5398a9262222fb360ae24"

 [[package]]
 name = "prime_factorization"
-version = "1.0.4"
+version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61b43cd4d5e49fa3c769f72033129f07eeaa102c3db2aa11be0c7f1a0cb50f0c"
+checksum = "bb24cb4f70d64221509ab3dca82ad2ec24e1d7f3fa3e7cb9eed4ced578683287"
 dependencies = [
- "itertools",
+ "itertools 0.10.5",
 "num",
 "rand",
 ]
@@ -209,6 +467,102 @@ dependencies = [
 "getrandom",
 ]

+[[package]]
+name = "rayon"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "regex"
+version = "1.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
+
+[[package]]
+name = "ryu"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.216"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.216"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.133"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
+dependencies = [
+ "itoa",
+ "memchr",
+ "ryu",
+ "serde",
+]
+
 [[package]]
 name = "syn"
 version = "2.0.90"
@@ -220,18 +574,193 @@ dependencies = [
 "unicode-ident",
 ]

+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"

+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
 [[package]]
 name = "wasi"
 version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

+[[package]]
+name = "wasm-bindgen"
+version = "0.2.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
+dependencies = [
+ "bumpalo",
+ "log",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
+
+[[package]]
+name = "web-sys"
+version = "0.3.76"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "winapi-util"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
+dependencies = [
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
 [[package]]
 name = "zerocopy"
 version = "0.7.35"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,4 +7,10 @@ edition = "2021"
 primality-test = "0.3.0"
 num-bigint = "0.4.6"
 num-traits = "0.2.19"
-prime_factorization = "1.0.4"
+prime_factorization = "1.0.5"
+itertools = "0.13.0"
+criterion = "0.5.1"
+
+[[bench]]
+name = "ntt"
+harness = false
--- a/benches/ntt.rs
+++ b/benches/ntt.rs
@@ -0,0 +1,122 @@
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use math::{modulus::prime::Prime,dft::ntt::Table};
+use math::dft::DFT;
+
+fn forward_inplace(c: &mut Criterion) {
+    fn runner<T: DFT<u64> + 'static>(prime_instance: &mut Prime<u64>, nth_root: u64) -> Box<dyn FnMut()+ '_> {
+        let ntt_table: Table<'_, u64> = Table::<u64>::new(prime_instance, nth_root);
+        let mut a: Vec<u64> = vec![0; (nth_root >> 1) as usize];
+        for i in 0..a.len(){
+            a[i] = i as u64;
+        }
+        Box::new(move || {
+            ntt_table.forward_inplace(&mut a)
+        })
+    }
+
+    let mut b: criterion::BenchmarkGroup<'_, criterion::measurement::WallTime> = c.benchmark_group("forward_inplace");
+    for log_nth_root in 11..18 {
+
+        let mut prime_instance: Prime<u64> = Prime::<u64>::new(0x1fffffffffe00001, 1);
+
+        let runners = [
+            ("prime", {
+                runner::<Table<'_, u64>>(&mut prime_instance, 1<<log_nth_root)
+            }),
+        ];
+        for (name, mut runner) in runners {
+            let id = BenchmarkId::new(name, 1<<(log_nth_root-1));
+            b.bench_with_input(id, &(), |b, _| b.iter(&mut runner));
+        }
+    }
+}
+
+fn forward_inplace_lazy(c: &mut Criterion) {
+    fn runner<T: DFT<u64> + 'static>(prime_instance: &mut Prime<u64>, nth_root: u64) -> Box<dyn FnMut()+ '_> {
+        let ntt_table: Table<'_, u64> = Table::<u64>::new(prime_instance, nth_root);
+        let mut a: Vec<u64> = vec![0; (nth_root >> 1) as usize];
+        for i in 0..a.len(){
+            a[i] = i as u64;
+        }
+        Box::new(move || {
+            ntt_table.forward_inplace_lazy(&mut a)
+        })
+    }
+
+    let mut b: criterion::BenchmarkGroup<'_, criterion::measurement::WallTime> = c.benchmark_group("forward_inplace_lazy");
+    for log_nth_root in 11..17 {
+
+        let mut prime_instance: Prime<u64> = Prime::<u64>::new(0x1fffffffffe00001, 1);
+
+        let runners = [
+            ("prime", {
+                runner::<Table<'_, u64>>(&mut prime_instance, 1<<log_nth_root)
+            }),
+        ];
+        for (name, mut runner) in runners {
+            let id = BenchmarkId::new(name, 1<<(log_nth_root-1));
+            b.bench_with_input(id, &(), |b, _| b.iter(&mut runner));
+        }
+    }
+}
+
+fn backward_inplace(c: &mut Criterion) {
+    fn runner<T: DFT<u64> + 'static>(prime_instance: &mut Prime<u64>, nth_root: u64) -> Box<dyn FnMut()+ '_> {
+        let ntt_table: Table<'_, u64> = Table::<u64>::new(prime_instance, nth_root);
+        let mut a: Vec<u64> = vec![0; (nth_root >> 1) as usize];
+        for i in 0..a.len(){
+            a[i] = i as u64;
+        }
+        Box::new(move || {
+            ntt_table.backward_inplace(&mut a)
+        })
+    }
+
+    let mut b: criterion::BenchmarkGroup<'_, criterion::measurement::WallTime> = c.benchmark_group("backward_inplace");
+    for log_nth_root in 11..18 {
+
+        let mut prime_instance: Prime<u64> = Prime::<u64>::new(0x1fffffffffe00001, 1);
+
+        let runners = [
+            ("prime", {
+                runner::<Table<'_, u64>>(&mut prime_instance, 1<<log_nth_root)
+            }),
+        ];
+        for (name, mut runner) in runners {
+            let id = BenchmarkId::new(name, 1<<(log_nth_root-1));
+            b.bench_with_input(id, &(), |b, _| b.iter(&mut runner));
+        }
+    }
+}
+
+fn backward_inplace_lazy(c: &mut Criterion) {
+    fn runner<T: DFT<u64> + 'static>(prime_instance: &mut Prime<u64>, nth_root: u64) -> Box<dyn FnMut()+ '_> {
+        let ntt_table: Table<'_, u64> = Table::<u64>::new(prime_instance, nth_root);
+        let mut a: Vec<u64> = vec![0; (nth_root >> 1) as usize];
+        for i in 0..a.len(){
+            a[i] = i as u64;
+        }
+        Box::new(move || {
+            ntt_table.backward_inplace_lazy(&mut a)
+        })
+    }
+
+    let mut b: criterion::BenchmarkGroup<'_, criterion::measurement::WallTime> = c.benchmark_group("backward_inplace_lazy");
+    for log_nth_root in 11..17 {
+
+        let mut prime_instance: Prime<u64> = Prime::<u64>::new(0x1fffffffffe00001, 1);
+
+        let runners = [
+            ("prime", {
+                runner::<Table<'_, u64>>(&mut prime_instance, 1<<log_nth_root)
+            }),
+        ];
+        for (name, mut runner) in runners {
+            let id = BenchmarkId::new(name, 1<<(log_nth_root-1));
+            b.bench_with_input(id, &(), |b, _| b.iter(&mut runner));
+        }
+    }
+}
+
+criterion_group!(benches, forward_inplace, forward_inplace_lazy, backward_inplace, backward_inplace_lazy);
+criterion_main!(benches);
--- a/examples/main.rs
+++ b/examples/main.rs
@@ -4,8 +4,8 @@ use math::dft::ntt::Table;

 fn main() {
    // Example usage of `Prime<u64>`
-    let q_base: u64 = 65537;      // Example prime base
-    let q_power: u64 = 2;     // Example power
+    let q_base: u64 = 0x1fffffffffe00001;      // Example prime base
+    let q_power: u64 = 1;     // Example power
    let mut prime_instance: Prime<u64> = Prime::<u64>::new(q_base, q_power);
    
    // Display the fields of `Prime` to verify
@@ -19,4 +19,20 @@ fn main() {

    let ntt_table: Table<'_, u64> = Table::<u64>::new(&mut prime_instance, nth_root);

+    let mut a: Vec<u64> = vec![0; (nth_root >> 1) as usize];
+
+    for i in 0..a.len(){
+        a[i] = i as u64;
+    }
+
+    println!("{:?}", a);
+
+    ntt_table.forward_inplace(&mut a);
+
+    println!("{:?}", a);
+
+    ntt_table.backward_inplace(&mut a);
+
+    println!("{:?}", a);
+
 }
--- a/src/dft.rs
+++ b/src/dft.rs
@@ -1 +1,8 @@
-pub mod ntt;
+pub mod ntt;
+
+pub trait DFT<O> {
+    fn forward_inplace_lazy(&self, x: &mut [O]);
+    fn forward_inplace(&self, x: &mut [O]);
+    fn backward_inplace_lazy(&self, x: &mut [O]);
+    fn backward_inplace(&self, x: &mut [O]);
+}
--- a/src/dft/ntt.rs
+++ b/src/dft/ntt.rs
@@ -1,10 +1,18 @@
 use crate::modulus::montgomery::Montgomery;
+use crate::modulus::shoup::Shoup;
 use crate::modulus::prime::Prime;
+use crate::modulus::ReduceOnce;
+use crate::modulus::WordOps;
+use crate::dft::DFT;
+use itertools::izip;

 pub struct Table<'a, O>{
    prime:&'a Prime<O>,
-    pub psi_forward_rev:Vec<Montgomery<O>>,
-    psi_backward_rev: Vec<Montgomery<O>>,
+    psi_forward_rev:Vec<Shoup<u64>>,
+    psi_backward_rev: Vec<Shoup<u64>>,
+    q:O,
+    two_q:O,
+    four_q:O,
 }

 impl<'a> Table<'a, u64> {
@@ -17,27 +25,227 @@ impl<'a> Table<'a, u64> {
        let psi_mont: Montgomery<u64> = prime.montgomery.prepare(psi);
        let psi_inv_mont: Montgomery<u64> = prime.montgomery.pow(psi_mont, prime.phi-1);
        
-        let mut psi_forward_rev: Vec<Montgomery<u64>> = vec![Montgomery(0); (nth_root >> 1) as usize];
-        let mut psi_backward_rev: Vec<Montgomery<u64>> = vec![Montgomery(0); (nth_root >> 1) as usize];
+        let mut psi_forward_rev: Vec<Shoup<u64>> = vec![Shoup(0, 0); (nth_root >> 1) as usize];
+        let mut psi_backward_rev: Vec<Shoup<u64>> = vec![Shoup(0, 0); (nth_root >> 1) as usize];

-        psi_forward_rev[0] = prime.montgomery.one();
-        psi_backward_rev[0] = prime.montgomery.one();
+        psi_forward_rev[0] = prime.shoup.prepare(1);
+        psi_backward_rev[0] = prime.shoup.prepare(1);

-        let log_nth_root_half: u32 = usize::BITS - ((nth_root>>1 as usize)-1).leading_zeros();
+        let log_nth_root_half: u32 = (nth_root>>1).log2() as _;
+
+        let mut powers_forward: u64 = 1u64;
+        let mut powers_backward: u64 = 1u64;

        for i in 1..(nth_root>>1) as usize{

-            let i_rev_prev: usize = (i-1).reverse_bits() >> (usize::BITS - log_nth_root_half);
-            let i_rev_next: usize = i.reverse_bits() >> (usize::BITS - log_nth_root_half);
+            let i_rev: usize = i.reverse_bits_msb(log_nth_root_half);

-            psi_forward_rev[i_rev_next] = prime.montgomery.mul_internal(psi_forward_rev[i_rev_prev], psi_mont);
-            psi_backward_rev[i_rev_next] = prime.montgomery.mul_internal(psi_backward_rev[i_rev_prev], psi_inv_mont);
+            prime.montgomery.mul_external_assign(psi_mont, &mut powers_forward);
+            prime.montgomery.mul_external_assign(psi_inv_mont, &mut powers_backward);
+
+            psi_forward_rev[i_rev] = prime.shoup.prepare(powers_forward);
+            psi_backward_rev[i_rev] = prime.shoup.prepare(powers_backward); 
        }

        Self{ 
            prime: prime, 
            psi_forward_rev: psi_forward_rev, 
-            psi_backward_rev: psi_backward_rev, 
+            psi_backward_rev: psi_backward_rev,
+            q:prime.q(),
+            two_q:prime.q()<<1,
+            four_q:prime.q()<<2,
        }
    }
-}
+
+    // Returns n^-1 mod q in Montgomery.
+    fn inv(&self, n:u64) -> Montgomery<u64>{
+        self.prime.montgomery.pow(self.prime.montgomery.prepare(n), self.prime.phi-1)
+    }
+}
+
+
+impl<'a> DFT<u64> for Table<'a,u64>{
+    fn forward_inplace(&self, a: &mut [u64]){
+        self.forward_inplace(a)
+    }
+
+    fn forward_inplace_lazy(&self, a: &mut [u64]){
+        self.forward_inplace_lazy(a)
+    }
+
+    fn backward_inplace(&self, a: &mut [u64]){
+        self.backward_inplace(a)
+    }
+
+    fn backward_inplace_lazy(&self, a: &mut [u64]){
+        self.backward_inplace_lazy(a)
+    }
+}
+
+impl<'a> Table<'a,u64>{
+
+    pub fn forward_inplace_lazy(&self, a: &mut [u64]){
+        self.forward_inplace_core::<true>(a);
+    }
+
+    pub fn forward_inplace(&self, a: &mut [u64]){
+        self.forward_inplace_core::<false>(a);
+    }
+
+    pub fn forward_inplace_core<const LAZY: bool>(&self, a: &mut [u64]) {
+
+        let n: usize = a.len();
+        assert!(n & n-1 == 0, "invalid x.len()= {} must be a power of two", n);
+        let log_n: u32 = usize::BITS - ((n as usize)-1).leading_zeros();
+
+        for layer in 0..log_n {
+            let (m, size) = (1 << layer, 1 << (log_n - layer - 1));
+            let t: usize = 2*size;
+            if layer == log_n - 1 {
+                if LAZY{
+                    izip!(a.chunks_exact_mut(t), &self.psi_forward_rev[m..]).for_each(|(a, psi)| {
+                        let (a, b) = a.split_at_mut(size);
+                        self.dit::<false>(&mut a[0], &mut b[0], *psi);
+                        debug_assert!(a[0] < self.two_q, "forward_inplace_core::<LAZY=true> output {} > {} (2q-1)", a[0], self.two_q-1);
+                        debug_assert!(b[0] < self.two_q, "forward_inplace_core::<LAZY=true> output {} > {} (2q-1)", b[0], self.two_q-1);
+                    });
+                }else{
+                    izip!(a.chunks_exact_mut(t), &self.psi_forward_rev[m..]).for_each(|(a, psi)| {
+                        let (a, b) = a.split_at_mut(size);
+                        self.dit::<true>(&mut a[0], &mut b[0], *psi);
+                        self.prime.shoup.reduce_assign(&mut a[0]);
+                        self.prime.shoup.reduce_assign(&mut b[0]);
+                        debug_assert!(a[0] < self.q, "forward_inplace_core::<LAZY=false> output {} > {} (q-1)", a[0], self.q-1);
+                        debug_assert!(b[0] < self.q, "forward_inplace_core::<LAZY=false> output {} > {} (q-1)", b[0], self.q-1);
+                    });
+                }
+                
+            } else if t >= 16{
+                izip!(a.chunks_exact_mut(t), &self.psi_forward_rev[m..]).for_each(|(a, psi)| {
+                    let (a, b) = a.split_at_mut(size);
+                    izip!(a.chunks_exact_mut(8), b.chunks_exact_mut(8)).for_each(|(a, b)| {
+                        self.dit::<true>(&mut a[0], &mut b[0], *psi);
+                        self.dit::<true>(&mut a[1], &mut b[1], *psi);
+                        self.dit::<true>(&mut a[2], &mut b[2], *psi);
+                        self.dit::<true>(&mut a[3], &mut b[3], *psi);
+                        self.dit::<true>(&mut a[4], &mut b[4], *psi);
+                        self.dit::<true>(&mut a[5], &mut b[5], *psi);
+                        self.dit::<true>(&mut a[6], &mut b[6], *psi);
+                        self.dit::<true>(&mut a[7], &mut b[7], *psi);
+                    });
+                });
+            }else{
+                izip!(a.chunks_exact_mut(t), &self.psi_forward_rev[m..]).for_each(|(a, psi)| {
+                    let (a, b) = a.split_at_mut(size);
+                    izip!(a, b).for_each(|(a, b)| self.dit::<true>(a, b, *psi));
+                });
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn dit<const LAZY: bool>(&self, a: &mut u64, b: &mut u64, t: Shoup<u64>) {
+        debug_assert!(*a < self.four_q, "a:{} q:{}", a, self.four_q);
+        debug_assert!(*b < self.four_q, "b:{} q:{}", b, self.four_q);
+        a.reduce_once_assign(self.two_q);
+        let bt: u64 = self.prime.shoup.mul_external_lazy(t, *b);
+        *b = a.wrapping_add(self.two_q-bt);
+        *a = a.wrapping_add(bt);
+        if !LAZY {
+            a.reduce_once_assign(self.two_q);
+            b.reduce_once_assign(self.two_q);
+        }
+    }
+
+    pub fn backward_inplace_lazy(&self, a: &mut [u64]){
+        self.backward_inplace_core::<true>(a);
+    }
+
+    pub fn backward_inplace(&self, a: &mut [u64]){
+        self.backward_inplace_core::<false>(a);
+    }
+
+    pub fn backward_inplace_core<const LAZY:bool>(&self, a: &mut [u64]) {
+        let n: usize = a.len();
+        assert!(n & n-1 == 0, "invalid x.len()= {} must be a power of two", n);
+        let log_n = usize::BITS - ((n as usize)-1).leading_zeros();
+
+        for layer in (0..log_n).rev() {
+            let (m, size) = (1 << layer, 1 << (log_n - layer - 1));
+            let t: usize = 2*size;
+            if layer == 0 {
+
+                let n_inv: Shoup<u64> = self.prime.shoup.prepare(self.prime.inv(n as u64));
+                let psi: Shoup<u64> = self.prime.shoup.prepare(self.prime.shoup.mul_external(n_inv, self.psi_backward_rev[1].0));
+
+                izip!(a.chunks_exact_mut(2 * size)).for_each(
+                    |a| {
+                        let (a, b) = a.split_at_mut(size);
+                        izip!(a.chunks_exact_mut(8), b.chunks_exact_mut(8)).for_each(|(a, b)| {
+                            self.dif_last::<LAZY>(&mut a[0], &mut b[0], psi, n_inv);
+                            self.dif_last::<LAZY>(&mut a[1], &mut b[1], psi, n_inv);
+                            self.dif_last::<LAZY>(&mut a[2], &mut b[2], psi, n_inv);
+                            self.dif_last::<LAZY>(&mut a[3], &mut b[3], psi, n_inv);
+                            self.dif_last::<LAZY>(&mut a[4], &mut b[4], psi, n_inv);
+                            self.dif_last::<LAZY>(&mut a[5], &mut b[5], psi, n_inv);
+                            self.dif_last::<LAZY>(&mut a[6], &mut b[6], psi, n_inv);
+                            self.dif_last::<LAZY>(&mut a[7], &mut b[7], psi, n_inv);
+                        });
+                    },
+                );
+
+            } else if t >= 16{
+                izip!(a.chunks_exact_mut(t), &self.psi_backward_rev[m..]).for_each(
+                    |(a, psi)| {
+                        let (a, b) = a.split_at_mut(size);
+                        izip!(a.chunks_exact_mut(8), b.chunks_exact_mut(8)).for_each(|(a, b)| {
+                            self.dif::<true>(&mut a[0], &mut b[0], *psi);
+                            self.dif::<true>(&mut a[1], &mut b[1], *psi);
+                            self.dif::<true>(&mut a[2], &mut b[2], *psi);
+                            self.dif::<true>(&mut a[3], &mut b[3], *psi);
+                            self.dif::<true>(&mut a[4], &mut b[4], *psi);
+                            self.dif::<true>(&mut a[5], &mut b[5], *psi);
+                            self.dif::<true>(&mut a[6], &mut b[6], *psi);
+                            self.dif::<true>(&mut a[7], &mut b[7], *psi);
+                        });
+                    },
+                );
+            } else {
+                izip!(a.chunks_exact_mut(2 * size), &self.psi_backward_rev[m..]).for_each(
+                    |(a, psi)| {
+                        let (a, b) = a.split_at_mut(size);
+                        izip!(a, b).for_each(|(a, b)| self.dif::<true>(a, b, *psi));
+                    },
+                );
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn dif<const LAZY: bool>(&self, a: &mut u64, b: &mut u64, t: Shoup<u64>) {
+        debug_assert!(*a < self.two_q);
+        debug_assert!(*b < self.two_q);
+        let d: u64 = self.prime.shoup.mul_external_lazy(t, *a + self.two_q - *b);
+        *a = a.wrapping_add(*b);
+        a.reduce_once_assign(self.two_q);
+        *b = d;
+        if !LAZY {
+            a.reduce_once_assign(self.q);
+            b.reduce_once_assign(self.q);
+        }
+    }
+
+    fn dif_last<const LAZY:bool>(&self, a: &mut u64, b: &mut u64, psi: Shoup<u64>, n_inv: Shoup<u64>){
+        debug_assert!(*a < self.two_q);
+        debug_assert!(*b < self.two_q);
+        if LAZY{
+            let d: u64 = self.prime.shoup.mul_external_lazy(psi, *a + self.two_q - *b);
+            *a = self.prime.shoup.mul_external_lazy(n_inv, *a + *b);
+            *b = d;
+        }else{
+            let d: u64 = self.prime.shoup.mul_external(psi, *a + self.two_q - *b);
+            *a = self.prime.shoup.mul_external(n_inv, *a + *b);
+            *b = d;
+        }
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,4 +2,6 @@
 #![feature(test)]

 pub mod modulus;
-pub mod dft;
+pub mod dft;
+pub mod ring;
+pub mod poly;
--- a/src/modulus.rs
+++ b/src/modulus.rs
@@ -1,25 +1,73 @@
 pub mod prime;
 pub mod barrett;
 pub mod montgomery;
+pub mod shoup;
+pub mod impl_u64;

-trait ReduceOnce<O>{
+pub trait WordOps<O>{
+    fn log2(self) -> O;
+    fn reverse_bits_msb(self, n:u32) -> O;
+}
+
+impl WordOps<u64> for u64{
+    #[inline(always)]
+    fn log2(self) -> u64{
+        (u64::BITS - (self-1).leading_zeros()) as _
+    }
+    #[inline(always)]
+    fn reverse_bits_msb(self, n: u32) -> u64{
+        self.reverse_bits() >> (usize::BITS - n)
+    }
+}
+
+impl WordOps<usize> for usize{
+    #[inline(always)]
+    fn log2(self) -> usize{
+        (usize::BITS - (self-1).leading_zeros()) as _
+    }
+    #[inline(always)]
+    fn reverse_bits_msb(self, n: u32) -> usize{
+        self.reverse_bits() >> (usize::BITS - n)
+    }
+}
+
+pub trait ReduceOnce<O>{
+    /// Assigns self-q to self if self >= q in constant time.
+    /// User must ensure that 2q fits in O.
+    fn reduce_once_constant_time_assign(&mut self, q: O);
+    /// Returns self-q if self >= q else self in constant time.
+    /// /// User must ensure that 2q fits in O.
+    fn reduce_once_constant_time(&self, q:O) -> O;
+    /// Assigns self-q to self if self >= q.
+    /// /// User must ensure that 2q fits in O.
    fn reduce_once_assign(&mut self, q: O);
+    /// Returns self-q if self >= q else self.
+    /// /// User must ensure that 2q fits in O.
    fn reduce_once(&self, q:O) -> O;
 }

 impl ReduceOnce<u64> for u64{
    #[inline(always)]
-    fn reduce_once_assign(&mut self, q: u64){
-        if *self >= q{
-            *self -= q
-        }
+    fn reduce_once_constant_time_assign(&mut self, q: u64){
+        debug_assert!(q < 0x8000000000000000, "2q >= 2^64");
+        *self -= (q.wrapping_sub(*self)>>63)*q;
    }
+    
+    #[inline(always)]
+    fn reduce_once_constant_time(&self, q:u64) -> u64{
+        debug_assert!(q < 0x8000000000000000, "2q >= 2^64");
+        self - (q.wrapping_sub(*self)>>63)*q
+    }
+
+    #[inline(always)]
+    fn reduce_once_assign(&mut self, q: u64){
+        debug_assert!(q < 0x8000000000000000, "2q >= 2^64");
+        *self = (*self).min(self.wrapping_sub(q))
+    }
+
    #[inline(always)]
    fn reduce_once(&self, q:u64) -> u64{
-        if *self >= q {
-            *self - q
-        } else {
-            *self
-        }
+        debug_assert!(q < 0x8000000000000000, "2q >= 2^64");
+        (*self).min(self.wrapping_sub(q))
    }
 }
--- a/src/modulus/barrett.rs
+++ b/src/modulus/barrett.rs
@@ -1,13 +1,8 @@
-use crate::modulus::ReduceOnce;
-
-use num_bigint::BigUint;
-use num_traits::cast::ToPrimitive;
-
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct BarrettPrecomp<O>{
-    q: O,
-    lo:O,
-    hi:O,
+    pub q: O,
+    pub lo:O,
+    pub hi:O,
 }

 impl<O> BarrettPrecomp<O>{
@@ -23,26 +18,3 @@ impl<O> BarrettPrecomp<O>{
    }
 }

-impl BarrettPrecomp<u64>{
-    pub fn new(q: u64) -> BarrettPrecomp<u64> {
-        let big_r: BigUint = (BigUint::from(1 as usize)<<((u64::BITS<<1) as usize)) / BigUint::from(q);
-        let lo: u64 = (&big_r & BigUint::from(u64::MAX)).to_u64().unwrap();
-        let hi: u64 = (big_r >> u64::BITS).to_u64().unwrap();
-        Self{q, lo, hi}
-    }
-
-    /// Returns lhs mod q.
-    #[inline(always)]
-    pub fn reduce(&self, lhs: u64) -> u64{
-        let mut r: u64 = self.reduce_lazy(lhs);
-        r.reduce_once_assign(self.q);
-        r
-    }
-
-    /// Returns lhs mod q in range [0, 2q-1].
-    #[inline(always)]
-    pub fn reduce_lazy(&self, lhs: u64) -> u64{
-        let (_, mhi) = lhs.widening_mul(self.hi);
-        lhs - mhi.wrapping_mul(self.q)
-    }
-}
--- a/src/modulus/impl_u64/barrett.rs
+++ b/src/modulus/impl_u64/barrett.rs
@@ -0,0 +1,43 @@
+use crate::modulus::barrett::BarrettPrecomp;
+use crate::modulus::ReduceOnce;
+
+use num_bigint::BigUint;
+use num_traits::cast::ToPrimitive;
+
+impl BarrettPrecomp<u64>{
+    pub fn new(q: u64) -> BarrettPrecomp<u64> {
+        let big_r: BigUint = (BigUint::from(1 as usize)<<((u64::BITS<<1) as usize)) / BigUint::from(q);
+        let lo: u64 = (&big_r & BigUint::from(u64::MAX)).to_u64().unwrap();
+        let hi: u64 = (big_r >> u64::BITS).to_u64().unwrap();
+        Self{q, lo, hi}
+    }
+
+    /// Returns lhs mod q.
+    #[inline(always)]
+    pub fn reduce(&self, lhs: u64) -> u64{
+        let mut r: u64 = self.reduce_lazy(lhs);
+        r.reduce_once_assign(self.q);
+        r
+    }
+
+    /// Returns lhs mod q in range [0, 2q-1].
+    #[inline(always)]
+    pub fn reduce_lazy(&self, lhs: u64) -> u64{
+        let (_, mhi) = lhs.widening_mul(self.hi);
+        lhs - mhi.wrapping_mul(self.q)
+    }
+
+    /// Assigns lhs mod q to lhs.
+    #[inline(always)]
+    pub fn reduce_assign(&self, lhs: &mut u64){
+        self.reduce_lazy_assign(lhs);
+        lhs.reduce_once_assign(self.q);
+    }
+
+    /// Assigns lhs mod q in range [0, 2q-1] to lhs.
+    #[inline(always)]
+    pub fn reduce_lazy_assign(&self, lhs: &mut u64){
+        let (_, mhi) = lhs.widening_mul(self.hi);
+        *lhs = *lhs - mhi.wrapping_mul(self.q)
+    }
+}
--- a/src/modulus/impl_u64/generation.rs
+++ b/src/modulus/impl_u64/generation.rs
--- a/src/modulus/impl_u64/mod.rs
+++ b/src/modulus/impl_u64/mod.rs
@@ -0,0 +1,4 @@
+pub mod prime;
+pub mod barrett;
+pub mod montgomery;
+pub mod shoup;
--- a/src/modulus/impl_u64/montgomery.rs
+++ b/src/modulus/impl_u64/montgomery.rs
@@ -0,0 +1,267 @@
+
+use crate::modulus::ReduceOnce;
+use crate::modulus::montgomery::{MontgomeryPrecomp, Montgomery};
+use crate::modulus::barrett::{BarrettPrecomp};
+extern crate test;
+
+/// MontgomeryPrecomp is a set of methods implemented for MontgomeryPrecomp<u64>
+/// enabling Montgomery arithmetic over u64 values.
+impl MontgomeryPrecomp<u64>{
+
+    /// Returns an new instance of MontgomeryPrecomp<u64>.
+    /// This method will fail if gcd(q, 2^64) != 1.
+    #[inline(always)]
+    pub fn new(q: u64) -> MontgomeryPrecomp<u64>{
+        assert!(q & 1 != 0, "Invalid argument: gcd(q={}, radix=2^64) != 1", q);
+        let mut q_inv: u64 = 1;
+        let mut q_pow = q;
+        for _i in 0..63{
+            q_inv = q_inv.wrapping_mul(q_pow);
+            q_pow = q_pow.wrapping_mul(q_pow);
+        }
+        let mut precomp = Self{ 
+            q: q, 
+            barrett: BarrettPrecomp::new(q), 
+            q_inv: q_inv,
+            one: Montgomery(0),
+            minus_one: Montgomery(0),
+        };
+
+        precomp.one = precomp.prepare(1);
+        precomp.minus_one = Montgomery(q-precomp.one.value());
+
+        precomp
+    }
+
+    /// Returns 2^64 mod q as a Montgomery<u64>.
+    #[inline(always)]
+    pub fn one(&self) -> Montgomery<u64>{
+        self.one
+    }
+
+    /// Returns (q-1) * 2^64 mod q as a Montgomery<u64>.
+    #[inline(always)]
+    pub fn minus_one(&self) -> Montgomery<u64>{
+        self.minus_one
+    }
+
+    /// Returns lhs * 2^64 mod q as a Montgomery<u64>.
+    #[inline(always)]
+    pub fn prepare(&self, lhs: u64) -> Montgomery<u64>{
+        let mut rhs = Montgomery(0);
+        self.prepare_assign(lhs, &mut rhs);
+        rhs
+    }
+
+    /// Assigns lhs * 2^64 mod q to rhs.
+    #[inline(always)]
+    pub fn prepare_assign(&self, lhs: u64, rhs: &mut Montgomery<u64>){
+        self.prepare_lazy_assign(lhs, rhs);
+        rhs.value_mut().reduce_once_assign(self.q);
+    }
+
+    /// Returns lhs * 2^64 mod q in range [0, 2q-1] as a Montgomery<u64>.
+    #[inline(always)]
+    pub fn prepare_lazy(&self, lhs: u64) -> Montgomery<u64>{
+        let mut rhs = Montgomery(0);
+        self.prepare_lazy_assign(lhs, &mut rhs);
+        rhs
+    }
+
+    /// Assigns lhs * 2^64 mod q in range [0, 2q-1] to rhs.
+    #[inline(always)]
+    pub fn prepare_lazy_assign(&self, lhs: u64, rhs: &mut Montgomery<u64>){
+        let (_, mhi) = lhs.widening_mul(*self.barrett.value_lo());
+        *rhs = Montgomery((lhs.wrapping_mul(*self.barrett.value_hi()).wrapping_add(mhi)).wrapping_mul(self.q).wrapping_neg());
+    }
+
+    /// Returns lhs * (2^64)^-1 mod q as a u64.
+    #[inline(always)]
+    pub fn unprepare(&self, lhs: Montgomery<u64>) -> u64{
+        let mut rhs = 0u64;
+        self.unprepare_assign(lhs, &mut rhs);
+        rhs
+    }
+
+    /// Assigns lhs * (2^64)^-1 mod q to rhs.
+    #[inline(always)]
+    pub fn unprepare_assign(&self, lhs: Montgomery<u64>, rhs: &mut u64){
+        self.unprepare_lazy_assign(lhs, rhs);
+        rhs.reduce_once_assign(self.q);
+        
+    }
+
+    /// Returns lhs * (2^64)^-1 mod q in range [0, 2q-1].
+    #[inline(always)]
+    pub fn unprepare_lazy(&self, lhs: Montgomery<u64>) -> u64{
+        let mut rhs = 0u64;
+        self.unprepare_lazy_assign(lhs, &mut rhs);
+        rhs
+    }
+
+    /// Assigns lhs * (2^64)^-1 mod q in range [0, 2q-1] to rhs.
+    #[inline(always)]
+    pub fn unprepare_lazy_assign(&self, lhs: Montgomery<u64>, rhs: &mut u64){
+        let (_, r) = self.q.widening_mul(lhs.value().wrapping_mul(self.q_inv));
+        *rhs = self.q - r
+    }
+
+    /// Returns lhs * rhs * (2^{64})^-1 mod q.
+    #[inline(always)]
+    pub fn mul_external(&self, lhs: Montgomery<u64>, rhs: u64) -> u64{
+        let mut r = self.mul_external_lazy(lhs, rhs);
+        r.reduce_once_assign(self.q);
+        r
+    }
+
+    /// Assigns lhs * rhs * (2^{64})^-1 mod q to rhs.
+    #[inline(always)]
+    pub fn mul_external_assign(&self, lhs: Montgomery<u64>, rhs: &mut u64){
+        self.mul_external_lazy_assign(lhs, rhs);
+        rhs.reduce_once_assign(self.q);
+    }
+
+    /// Returns lhs * rhs * (2^{64})^-1 mod q in range [0, 2q-1].
+    #[inline(always)]
+    pub fn mul_external_lazy(&self, lhs: Montgomery<u64>, rhs: u64) -> u64{
+        let mut result: u64 = rhs;
+        self.mul_external_lazy_assign(lhs, &mut result);
+        result
+    }
+
+    /// Assigns lhs * rhs * (2^{64})^-1 mod q in range [0, 2q-1] to rhs.
+    #[inline(always)]
+    pub fn mul_external_lazy_assign(&self, lhs: Montgomery<u64>, rhs: &mut u64){
+        let (mlo, mhi) = lhs.value().widening_mul(*rhs);
+        let (_, hhi) = self.q.widening_mul(mlo.wrapping_mul(self.q_inv));
+        *rhs = mhi.wrapping_add(self.q - hhi)
+    }
+
+    /// Returns lhs * rhs * (2^{64})^-1 mod q in range [0, 2q-1].
+    #[inline(always)]
+    pub fn mul_internal(&self, lhs: Montgomery<u64>, rhs: Montgomery<u64>) -> Montgomery<u64>{
+        Montgomery(self.mul_external(lhs, *rhs.value()))
+    }
+
+    /// Assigns lhs * rhs * (2^{64})^-1 mod q to rhs.
+    #[inline(always)]
+    pub fn mul_internal_assign(&self, lhs: Montgomery<u64>, rhs: &mut Montgomery<u64>){
+        self.mul_external_assign(lhs, rhs.value_mut());
+    }
+
+    /// Returns lhs * rhs * (2^{64})^-1 mod q in range [0, 2q-1].
+    #[inline(always)]
+    pub fn mul_internal_lazy(&self, lhs: Montgomery<u64>, rhs: Montgomery<u64>) -> Montgomery<u64>{
+        Montgomery(self.mul_external_lazy(lhs, *rhs.value()))
+    }
+
+    /// Assigns lhs * rhs * (2^{64})^-1 mod q in range [0, 2q-1] to rhs.
+    #[inline(always)]
+    pub fn mul_internal_lazy_assign(&self, lhs: Montgomery<u64>, rhs: &mut Montgomery<u64>){
+        self.mul_external_lazy_assign(lhs, rhs.value_mut());
+    }
+
+    #[inline(always)]
+    pub fn add_internal(&self, lhs: Montgomery<u64>, rhs: Montgomery<u64>) -> Montgomery<u64>{
+        Montgomery(self.barrett.reduce(rhs.value() + lhs.value()))
+    }
+
+    /// Assigns lhs + rhs to rhs.
+    #[inline(always)]
+    pub fn add_internal_lazy_assign(&self, lhs: Montgomery<u64>, rhs: &mut Montgomery<u64>){
+        *rhs.value_mut() += lhs.value()
+    }
+
+    /// Assigns lhs + rhs - q if (lhs + rhs) >= q to rhs.
+    #[inline(always)]
+    pub fn add_internal_reduce_once_assign(&self, lhs: Montgomery<u64>, rhs: &mut Montgomery<u64>){
+        self.add_internal_lazy_assign(lhs, rhs);
+        rhs.value_mut().reduce_once_assign(self.q);
+    }
+
+    #[inline(always)]
+    pub fn reduce(&self, lhs: u64) -> u64{
+        self.barrett.reduce(lhs)
+    }
+
+    /// Returns lhs mod q in range [0, 2q-1].
+    #[inline(always)]
+    pub fn reduce_lazy(&self, lhs: u64) -> u64{
+        self.barrett.reduce_lazy(lhs)
+    }
+
+    #[inline(always)]
+    pub fn reduce_assign(&self, lhs: &mut u64){
+        self.barrett.reduce_assign(lhs)
+    }
+
+    /// Returns lhs mod q in range [0, 2q-1].
+    #[inline(always)]
+    pub fn reduce_lazy_assign(&self, lhs: &mut u64){
+        self.barrett.reduce_lazy_assign(lhs)
+    }
+
+    /// Returns (x^exponent) * 2^64 mod q.
+    #[inline(always)]
+    pub fn pow(&self, x: Montgomery<u64>, exponent:u64) -> Montgomery<u64>{
+        let mut y: Montgomery<u64> = self.one();
+        let mut x_mut: Montgomery<u64> = x;
+        let mut i: u64 = exponent;
+        while i > 0{
+            if i & 1 == 1{
+                self.mul_internal_assign(x_mut, &mut y);
+            }
+            self.mul_internal_assign(x_mut, &mut x_mut);
+            i >>= 1;
+        }
+
+        y.value_mut().reduce_once_assign(self.q);
+        y
+    }
+}
+
+/// Returns x^exponent mod q.
+/// This function internally instantiate a new MontgomeryPrecomp<u64>
+/// To be used when called only a few times and if there 
+/// is no Prime instantiated with q.
+fn pow(x:u64, exponent:u64, q:u64) -> u64{
+    let montgomery: MontgomeryPrecomp<u64> = MontgomeryPrecomp::<u64>::new(q);
+    let mut y_mont: Montgomery<u64> = montgomery.one();
+    let mut x_mont: Montgomery<u64> = montgomery.prepare(x);
+    while exponent > 0{
+        if exponent & 1 == 1{
+            montgomery.mul_internal_assign(x_mont, &mut y_mont);
+        }
+
+        montgomery.mul_internal_assign(x_mont, &mut x_mont);
+    }
+    
+    montgomery.unprepare(y_mont)
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::modulus::montgomery;
+    use super::*;
+    use test::Bencher;
+
+    #[test]
+    fn test_mul_external() {
+        let q: u64 = 0x1fffffffffe00001;
+    	let m_precomp = montgomery::MontgomeryPrecomp::new(q);
+        let x: u64 = 0x5f876e514845cc8b;
+        let y: u64 = 0xad726f98f24a761a;
+        let y_mont = m_precomp.prepare(y);
+    	assert!(m_precomp.mul_external(y_mont, x) == (x as u128 * y as u128 % q as u128) as u64);
+    }
+
+    #[bench]
+    fn bench_mul_external(b: &mut Bencher){
+        let q: u64 = 0x1fffffffffe00001;
+    	let m_precomp = montgomery::MontgomeryPrecomp::new(q);
+        let mut x: u64 = 0x5f876e514845cc8b;
+        let y: u64 = 0xad726f98f24a761a;
+        let y_mont = m_precomp.prepare(y);
+        b.iter(|| m_precomp.mul_external_assign(y_mont, &mut x));
+    }
+}
--- a/src/modulus/impl_u64/prime.rs
+++ b/src/modulus/impl_u64/prime.rs
@@ -0,0 +1,214 @@
+use crate::modulus::prime::Prime;
+use crate::modulus::montgomery::{Montgomery, MontgomeryPrecomp};
+use crate::modulus::shoup::{ShoupPrecomp};
+use primality_test::is_prime;
+use prime_factorization::Factorization;
+
+impl Prime<u64>{
+
+    /// Returns a new instance of Prime<u64>.
+    /// Panics if q_base is not a prime > 2 and
+    /// if q_base^q_power would overflow u64.
+    pub fn new(q_base: u64, q_power: u64) -> Self{
+		assert!(is_prime(q_base) && q_base > 2);
+        Self::new_unchecked(q_base, q_power)
+	}
+
+    /// Returns a new instance of Prime<u64>.
+    /// Does not check if q_base is a prime > 2.
+    /// Panics if q_base^q_power would overflow u64.
+	pub fn new_unchecked(q_base: u64, q_power: u64) -> Self {
+
+        let mut q = q_base;
+        for _i in 1..q_power{
+            q *= q_base
+        }
+
+        assert!(q.next_power_of_two().ilog2() <= 61);
+
+        let mut phi = q_base-1;
+        for _i in 1..q_power{
+            phi *= q_base
+        }
+
+        Self {
+            q:q,
+            q_base:q_base,
+            q_power:q_power,
+            factors: Vec::new(),
+            montgomery:MontgomeryPrecomp::new(q),
+            shoup:ShoupPrecomp::new(q),
+            phi:phi,
+        }
+    }
+
+    pub fn q(&self) -> u64{
+        self.q
+    }
+
+    pub fn q_base(&self) -> u64{
+        self.q_base
+    }
+
+    pub fn q_power(&self) -> u64{
+        self.q_power
+    }
+
+    /// Returns x^exponen mod q.
+    #[inline(always)]
+    pub fn pow(&self, x: u64, exponent: u64) -> u64{
+        let mut y_mont: Montgomery<u64> = self.montgomery.one();
+        let mut x_mont: Montgomery<u64> = self.montgomery.prepare(x);
+        let mut i: u64 = exponent;
+        while i > 0{
+            if i & 1 == 1{
+                self.montgomery.mul_internal_assign(x_mont, &mut y_mont);
+            }
+
+            self.montgomery.mul_internal_assign(x_mont, &mut x_mont);
+
+            i >>= 1;
+
+        }
+        
+        self.montgomery.unprepare(y_mont)
+    }
+
+    /// Returns x^-1 mod q.
+    /// User must ensure that x is not divisible by q_base.
+    #[inline(always)]
+    pub fn inv(&self, x: u64) -> u64{
+        self.pow(x, self.phi-1)
+    }
+}
+
+impl Prime<u64>{
+    /// Returns the smallest nth primitive root of q_base.
+    pub fn primitive_root(&mut self) -> u64{
+
+        self.check_factors();
+
+        let mut candidate: u64 = 1u64;
+        let mut not_found: bool = true;
+
+        while not_found{
+
+            candidate += 1;
+
+            for &factor in &self.factors{
+
+                if  Pow(candidate, (self.q_base-1)/factor, self.q_base) == 1{
+                    not_found = true;
+                    break
+                }
+                not_found = false;
+            }
+        }
+
+        if not_found{
+            panic!("failed to find a primitive root for q_base={}", self.q_base)
+        }
+
+        candidate
+    }
+
+    /// Returns an nth primitive root of q = q_base^q_power in Montgomery.
+    pub fn primitive_nth_root(&mut self, nth_root:u64) -> u64{
+
+        assert!(self.q & (nth_root-1) == 1, "invalid prime: q = {} % nth_root = {} = {} != 1", self.q, nth_root, self.q & (nth_root-1));
+
+        let psi: u64 = self.primitive_root();
+
+        // nth primitive root mod q_base: psi_nth^(prime.q_base-1)/nth_root mod q_base
+        let psi_nth_q_base: u64 = Pow(psi, (self.q_base-1)/nth_root, self.q_base);
+
+        // lifts nth primitive root mod q_base to q = q_base^q_power
+        let psi_nth_q: u64 = self.hensel_lift(psi_nth_q_base, nth_root);
+
+        assert!(self.pow(psi_nth_q, nth_root) == 1, "invalid nth primitive root: psi^nth_root != 1 mod q");
+        assert!(self.pow(psi_nth_q, nth_root>>1) == self.q-1, "invalid nth primitive root: psi^(nth_root/2) != -1 mod q");
+
+        psi_nth_q
+    }
+
+    /// Checks if the field self.factor is populated.
+    /// If not, factorize q_base-1 and populates self.factor.
+    /// If yes, checks that it contains the unique factors of q_base-1.
+    pub fn check_factors(&mut self){
+
+        if self.factors.len() == 0{
+
+            let factors = Factorization::run(self.q_base-1).prime_factor_repr();
+            let mut distincts_factors: Vec<u64> = Vec::with_capacity(factors.len());
+            for factor in factors.iter(){
+                distincts_factors.push(factor.0)
+            }
+            self.factors = distincts_factors
+            
+        }else{
+            let mut q_base: u64 = self.q_base;
+
+            for &factor in &self.factors{
+                if !is_prime(factor){
+                    panic!("invalid factor list: factor {} is not prime", factor)
+                }
+    
+                while q_base%factor != 0{
+                    q_base /= factor
+                }
+            }
+    
+            if q_base != 1{
+                panic!("invalid factor list: does not fully divide q_base: q_base % (all factors) = {}", q_base)
+            }
+        }
+
+        
+    }
+
+    /// Returns (psi + a * q_base)^{nth_root} = 1 mod q = q_base^q_power given psi^{nth_root} = 1 mod q_base.
+    /// Panics if psi^{nth_root} != 1 mod q_base.
+    fn hensel_lift(&self, psi: u64, nth_root: u64) -> u64{
+        assert!(Pow(psi, nth_root, self.q_base)==1, "invalid argument psi: psi^nth_root = {} != 1", Pow(psi, nth_root, self.q_base));
+
+        let mut psi_mont: Montgomery<u64> = self.montgomery.prepare(psi);
+        let nth_root_mont: Montgomery<u64> = self.montgomery.prepare(nth_root);
+
+        for _i in 1..self.q_power{
+
+            let psi_pow: Montgomery<u64> = self.montgomery.pow(psi_mont, nth_root-1);
+
+            let num: Montgomery<u64> = Montgomery(self.montgomery.one().value() + self.q - self.montgomery.mul_internal(psi_pow, psi_mont).value());
+
+            let mut den: Montgomery<u64> = self.montgomery.mul_internal(nth_root_mont, psi_pow);
+
+            den = self.montgomery.pow(den, self.phi-1);
+
+            psi_mont = self.montgomery.add_internal(psi_mont, self.montgomery.mul_internal(num, den));
+        }
+        
+        self.montgomery.unprepare(psi_mont)
+    }
+}
+
+/// Returns x^exponent mod q.
+/// This function internally instantiate a new MontgomeryPrecomp<u64>
+/// To be used when called only a few times and if there 
+/// is no Prime instantiated with q.
+pub fn Pow(x:u64, exponent:u64, q:u64) -> u64{
+    let montgomery: MontgomeryPrecomp<u64> = MontgomeryPrecomp::<u64>::new(q);
+    let mut y_mont: Montgomery<u64> = montgomery.one();
+    let mut x_mont: Montgomery<u64> = montgomery.prepare(x);
+    let mut i: u64 = exponent;
+    while i > 0{
+        if i & 1 == 1{
+            montgomery.mul_internal_assign(x_mont, &mut y_mont);
+        }
+
+        montgomery.mul_internal_assign(x_mont, &mut x_mont);
+
+        i >>= 1;
+    }
+    
+    montgomery.unprepare(y_mont)
+}
--- a/src/modulus/impl_u64/shoup.rs
+++ b/src/modulus/impl_u64/shoup.rs
@@ -0,0 +1,60 @@
+use crate::modulus::ReduceOnce;
+use crate::modulus::shoup::{ShoupPrecomp, Shoup};
+
+impl ShoupPrecomp<u64>{
+
+    pub fn new(q: u64) -> Self {
+        let mut precomp: ShoupPrecomp<u64> = Self{q:q, one:Shoup(0,0)};
+        precomp.one = precomp.prepare(1);
+        precomp
+    }
+
+    #[inline(always)]
+    pub fn one(&self) -> Shoup<u64> {
+        self.one
+    }
+
+    #[inline(always)]
+    pub fn prepare(&self, v: u64) -> Shoup<u64> {
+        debug_assert!(v < self.q);
+        let quotient: u64 = (((v as u128) << 64) / self.q as u128) as _;
+        Shoup(v, quotient)
+    }
+
+    #[inline(always)]
+    pub fn mul_external(&self, lhs: Shoup<u64>, rhs: u64) -> u64 {
+        let mut r: u64 = self.mul_external_lazy(lhs, rhs);
+        r.reduce_once_assign(self.q);
+        r
+    }
+
+    #[inline(always)]
+    pub fn mul_external_assign(&self, lhs: Shoup<u64>, rhs: &mut u64){
+        self.mul_external_lazy_assign(lhs, rhs);
+        rhs.reduce_once_assign(self.q);
+    }
+
+    #[inline(always)]
+    pub fn mul_external_lazy(&self, lhs: Shoup<u64>, rhs: u64) -> u64 {
+        let mut r: u64 = rhs;
+        self.mul_external_lazy_assign(lhs, &mut r);
+        r
+    }
+
+    #[inline(always)]
+    pub fn mul_external_lazy_assign(&self, lhs: Shoup<u64>, rhs: &mut u64){
+        let t: u64 = ((*lhs.quotient() as u128 * *rhs as u128) >> 64) as _;
+        *rhs = (rhs.wrapping_mul(*lhs.value())).wrapping_sub(self.q.wrapping_mul(t));
+    }
+
+    #[inline(always)]
+    pub fn reduce_assign(&self, rhs: &mut u64){
+        self.reduce_assign_lazy(rhs);
+        rhs.reduce_once_assign(self.q);
+    }
+
+    #[inline(always)]
+    pub fn reduce_assign_lazy(&self, rhs: &mut u64){
+        *rhs = rhs.wrapping_sub(self.q.wrapping_mul(((self.one.1 as u128 * *rhs as u128) >> 64) as _))
+    }
+}
--- a/src/modulus/montgomery.rs
+++ b/src/modulus/montgomery.rs
@@ -1,7 +1,4 @@
 use crate::modulus::barrett::BarrettPrecomp;
-use crate::modulus::ReduceOnce;
-
-extern crate test;

 /// Montgomery is a generic struct storing
 /// an element in the Montgomery domain.
@@ -30,249 +27,9 @@ impl<O> Montgomery<O>{
 /// precomputations for Montgomery arithmetic.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct MontgomeryPrecomp<O>{
-    q: O,
-    barrett: BarrettPrecomp<O>,
-    q_inv: O,
-    one: Montgomery<O>,
-    minus_one: Montgomery<O>,
+    pub q: O,
+    pub barrett: BarrettPrecomp<O>,
+    pub q_inv: O,
+    pub one: Montgomery<O>,
+    pub minus_one: Montgomery<O>,
 }
-
-/// MontgomeryPrecomp is a set of methods implemented for MontgomeryPrecomp<u64>
-/// enabling Montgomery arithmetic over u64 values.
-impl MontgomeryPrecomp<u64>{
-
-    /// Returns an new instance of MontgomeryPrecomp<u64>.
-    /// This method will fail if gcd(q, 2^64) != 1.
-    #[inline(always)]
-    pub fn new(q: u64) -> MontgomeryPrecomp<u64>{
-        assert!(q & 1 != 0, "Invalid argument: gcd(q={}, radix=2^64) != 1", q);
-        let mut q_inv: u64 = 1;
-        let mut q_pow = q;
-        for _i in 0..63{
-            q_inv = q_inv.wrapping_mul(q_pow);
-            q_pow = q_pow.wrapping_mul(q_pow);
-        }
-        let mut precomp = Self{ 
-            q: q, 
-            barrett: BarrettPrecomp::new(q), 
-            q_inv: q_inv,
-            one: Montgomery(0),
-            minus_one: Montgomery(0),
-        };
-
-        precomp.one = precomp.prepare(1);
-        precomp.minus_one = Montgomery(q-precomp.one.value());
-
-        precomp
-    }
-
-    /// Returns 2^64 mod q as a Montgomery<u64>.
-    #[inline(always)]
-    pub fn one(&self) -> Montgomery<u64>{
-        self.one
-    }
-
-    /// Returns (q-1) * 2^64 mod q as a Montgomery<u64>.
-    #[inline(always)]
-    pub fn minus_one(&self) -> Montgomery<u64>{
-        self.minus_one
-    }
-
-    /// Returns lhs * 2^64 mod q as a Montgomery<u64>.
-    #[inline(always)]
-    pub fn prepare(&self, lhs: u64) -> Montgomery<u64>{
-        let mut rhs = Montgomery(0);
-        self.prepare_assign(lhs, &mut rhs);
-        rhs
-    }
-
-    /// Assigns lhs * 2^64 mod q to rhs.
-    #[inline(always)]
-    pub fn prepare_assign(&self, lhs: u64, rhs: &mut Montgomery<u64>){
-        self.prepare_lazy_assign(lhs, rhs);
-        rhs.value_mut().reduce_once_assign(self.q);
-    }
-
-    /// Returns lhs * 2^64 mod q in range [0, 2q-1] as a Montgomery<u64>.
-    #[inline(always)]
-    pub fn prepare_lazy(&self, lhs: u64) -> Montgomery<u64>{
-        let mut rhs = Montgomery(0);
-        self.prepare_lazy_assign(lhs, &mut rhs);
-        rhs
-    }
-
-    /// Assigns lhs * 2^64 mod q in range [0, 2q-1] to rhs.
-    #[inline(always)]
-    pub fn prepare_lazy_assign(&self, lhs: u64, rhs: &mut Montgomery<u64>){
-        let (_, mhi) = lhs.widening_mul(*self.barrett.value_lo());
-        *rhs = Montgomery((lhs.wrapping_mul(*self.barrett.value_hi()).wrapping_add(mhi)).wrapping_mul(self.q).wrapping_neg());
-    }
-
-    /// Returns lhs * (2^64)^-1 mod q as a u64.
-    #[inline(always)]
-    pub fn unprepare(&self, lhs: Montgomery<u64>) -> u64{
-        let mut rhs = 0u64;
-        self.unprepare_assign(lhs, &mut rhs);
-        rhs
-    }
-
-    /// Assigns lhs * (2^64)^-1 mod q to rhs.
-    #[inline(always)]
-    pub fn unprepare_assign(&self, lhs: Montgomery<u64>, rhs: &mut u64){
-        self.unprepare_lazy_assign(lhs, rhs);
-        rhs.reduce_once_assign(self.q);
-        
-    }
-
-    /// Returns lhs * (2^64)^-1 mod q in range [0, 2q-1].
-    #[inline(always)]
-    pub fn unprepare_lazy(&self, lhs: Montgomery<u64>) -> u64{
-        let mut rhs = 0u64;
-        self.unprepare_lazy_assign(lhs, &mut rhs);
-        rhs
-    }
-
-    /// Assigns lhs * (2^64)^-1 mod q in range [0, 2q-1] to rhs.
-    #[inline(always)]
-    pub fn unprepare_lazy_assign(&self, lhs: Montgomery<u64>, rhs: &mut u64){
-        let (_, r) = self.q.widening_mul(lhs.value().wrapping_mul(self.q_inv));
-        *rhs = self.q - r
-    }
-
-    /// Returns lhs * rhs * (2^{64})^-1 mod q.
-    #[inline(always)]
-    pub fn mul_external(&self, lhs: Montgomery<u64>, rhs: u64) -> u64{
-        let mut r = self.mul_external_lazy(lhs, rhs);
-        r.reduce_once_assign(self.q);
-        r
-    }
-
-    /// Assigns lhs * rhs * (2^{64})^-1 mod q to rhs.
-    #[inline(always)]
-    pub fn mul_external_assign(&self, lhs: Montgomery<u64>, rhs: &mut u64){
-        self.mul_external_lazy_assign(lhs, rhs);
-        rhs.reduce_once_assign(self.q);
-    }
-
-    /// Returns lhs * rhs * (2^{64})^-1 mod q in range [0, 2q-1].
-    #[inline(always)]
-    pub fn mul_external_lazy(&self, lhs: Montgomery<u64>, rhs: u64) -> u64{
-        let mut result = rhs;
-        self.mul_external_lazy_assign(lhs, &mut result);
-        result
-    }
-
-    /// Assigns lhs * rhs * (2^{64})^-1 mod q in range [0, 2q-1] to rhs.
-    #[inline(always)]
-    pub fn mul_external_lazy_assign(&self, lhs: Montgomery<u64>, rhs: &mut u64){
-        let (mlo, mhi) = lhs.value().widening_mul(*rhs);
-        let (_, hhi) = self.q.widening_mul(mlo.wrapping_mul(self.q_inv));
-        *rhs = mhi.wrapping_add(self.q - hhi)
-    }
-
-    /// Returns lhs * rhs * (2^{64})^-1 mod q in range [0, 2q-1].
-    #[inline(always)]
-    pub fn mul_internal(&self, lhs: Montgomery<u64>, rhs: Montgomery<u64>) -> Montgomery<u64>{
-        Montgomery(self.mul_external(lhs, *rhs.value()))
-    }
-
-    /// Assigns lhs * rhs * (2^{64})^-1 mod q to rhs.
-    #[inline(always)]
-    pub fn mul_internal_assign(&self, lhs: Montgomery<u64>, rhs: &mut Montgomery<u64>){
-        self.mul_external_assign(lhs, rhs.value_mut());
-    }
-
-    /// Returns lhs * rhs * (2^{64})^-1 mod q in range [0, 2q-1].
-    #[inline(always)]
-    pub fn mul_internal_lazy(&self, lhs: Montgomery<u64>, rhs: Montgomery<u64>) -> Montgomery<u64>{
-        Montgomery(self.mul_external_lazy(lhs, *rhs.value()))
-    }
-
-    /// Assigns lhs * rhs * (2^{64})^-1 mod q in range [0, 2q-1] to rhs.
-    #[inline(always)]
-    pub fn mul_internal_lazy_assign(&self, lhs: Montgomery<u64>, rhs: &mut Montgomery<u64>){
-        self.mul_external_lazy_assign(lhs, rhs.value_mut());
-    }
-
-    #[inline(always)]
-    pub fn add_internal(&self, lhs: Montgomery<u64>, rhs: Montgomery<u64>) -> Montgomery<u64>{
-        Montgomery(self.barrett.reduce(rhs.value() + lhs.value()))
-    }
-
-    /// Assigns lhs + rhs to rhs.
-    #[inline(always)]
-    pub fn add_internal_lazy_assign(&self, lhs: Montgomery<u64>, rhs: &mut Montgomery<u64>){
-        *rhs.value_mut() += lhs.value()
-    }
-
-    /// Assigns lhs + rhs - q if (lhs + rhs) >= q to rhs.
-    #[inline(always)]
-    pub fn add_internal_reduce_once_assign(&self, lhs: Montgomery<u64>, rhs: &mut Montgomery<u64>){
-        self.add_internal_lazy_assign(lhs, rhs);
-        rhs.value_mut().reduce_once_assign(self.q);
-    }
-
-    /// Returns (x^exponent) * 2^64 mod q.
-    #[inline(always)]
-    pub fn pow(&self, x: Montgomery<u64>, exponent:u64) -> Montgomery<u64>{
-        let mut y: Montgomery<u64> = self.one();
-        let mut x_mut: Montgomery<u64> = x;
-        let mut i: u64 = exponent;
-        while i > 0{
-            if i & 1 == 1{
-                self.mul_internal_assign(x_mut, &mut y);
-            }
-            self.mul_internal_assign(x_mut, &mut x_mut);
-            i >>= 1;
-        }
-
-        y.value_mut().reduce_once_assign(self.q);
-        y
-    }
-}
-
-/// Returns x^exponent mod q.
-/// This function internally instantiate a new MontgomeryPrecomp<u64>
-/// To be used when called only a few times and if there 
-/// is no Prime instantiated with q.
-fn pow(x:u64, exponent:u64, q:u64) -> u64{
-    let montgomery: MontgomeryPrecomp<u64> = MontgomeryPrecomp::<u64>::new(q);
-    let mut y_mont: Montgomery<u64> = montgomery.one();
-    let mut x_mont: Montgomery<u64> = montgomery.prepare(x);
-    while exponent > 0{
-        if exponent & 1 == 1{
-            montgomery.mul_internal_assign(x_mont, &mut y_mont);
-        }
-
-        montgomery.mul_internal_assign(x_mont, &mut x_mont);
-    }
-    
-    montgomery.unprepare(y_mont)
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::modulus::montgomery;
-    use super::*;
-    use test::Bencher;
-
-    #[test]
-    fn test_mul_external() {
-        let q: u64 = 0x1fffffffffe00001;
-    	let m_precomp = montgomery::MontgomeryPrecomp::new(q);
-        let x: u64 = 0x5f876e514845cc8b;
-        let y: u64 = 0xad726f98f24a761a;
-        let y_mont = m_precomp.prepare(y);
-    	assert!(m_precomp.mul_external(y_mont, x) == (x as u128 * y as u128 % q as u128) as u64);
-    }
-
-    #[bench]
-    fn bench_mul_external(b: &mut Bencher){
-        let q: u64 = 0x1fffffffffe00001;
-    	let m_precomp = montgomery::MontgomeryPrecomp::new(q);
-        let mut x: u64 = 0x5f876e514845cc8b;
-        let y: u64 = 0xad726f98f24a761a;
-        let y_mont = m_precomp.prepare(y);
-        b.iter(|| m_precomp.mul_external_assign(y_mont, &mut x));
-    }
-}
--- a/src/modulus/prime.rs
+++ b/src/modulus/prime.rs
@@ -1,7 +1,6 @@
 use crate::modulus::montgomery::{MontgomeryPrecomp, Montgomery};
-use crate::dft::ntt::Table;
-use primality_test::is_prime;
-use prime_factorization::Factorization;
+use crate::modulus::shoup::{ShoupPrecomp};
+

 pub struct Prime<O> {
 	pub q: O, /// q_base^q_powers
@@ -9,222 +8,14 @@ pub struct Prime<O> {
    pub q_power: O,
    pub factors: Vec<O>, /// distinct factors of q-1
    pub montgomery: MontgomeryPrecomp<O>,
+    pub shoup:ShoupPrecomp<O>,
    pub phi: O,
 }

 pub struct NTTFriendlyPrimesGenerator<O>{
-	size: f64,
-	next_prime: O,
-	prev_prime: O,
-	check_next_prime: bool,
-	check_prev_prime: bool,
-}
-
-impl Prime<u64>{
-
-    /// Returns a new instance of Prime<u64>.
-    /// Panics if q_base is not a prime > 2 and
-    /// if q_base^q_power would overflow u64.
-    pub fn new(q_base: u64, q_power: u64) -> Self{
-		assert!(is_prime(q_base) && q_base > 2);
-        Self::new_unchecked(q_base, q_power)
-	}
-
-    /// Returns a new instance of Prime<u64>.
-    /// Does not check if q_base is a prime > 2.
-    /// Panics if q_base^q_power would overflow u64.
-	pub fn new_unchecked(q_base: u64, q_power: u64) -> Self {
-
-        let mut q = q_base;
-        for _i in 1..q_power{
-            q *= q_base
-        }
-
-        assert!(q.next_power_of_two().ilog2() <= 61);
-
-        let mut phi = q_base-1;
-        for _i in 1..q_power{
-            phi *= q_base
-        }
-
-        Self {
-            q:q,
-            q_base:q_base,
-            q_power:q_power,
-            factors: Vec::new(),
-            montgomery:MontgomeryPrecomp::new(q),
-            phi:phi,
-        }
-    }
-
-    pub fn q(&self) -> u64{
-        self.q
-    }
-
-    pub fn q_base(&self) -> u64{
-        self.q_base
-    }
-
-    pub fn q_power(&self) -> u64{
-        self.q_power
-    }
-
-    /// Returns x^exponen mod q.
-    #[inline(always)]
-    pub fn pow(&self, x: u64, exponent: u64) -> u64{
-        let mut y_mont: Montgomery<u64> = self.montgomery.one();
-        let mut x_mont: Montgomery<u64> = self.montgomery.prepare(x);
-        let mut i: u64 = exponent;
-        while i > 0{
-            if i & 1 == 1{
-                self.montgomery.mul_internal_assign(x_mont, &mut y_mont);
-            }
-
-            self.montgomery.mul_internal_assign(x_mont, &mut x_mont);
-
-            i >>= 1;
-
-            
-        }
-        
-        self.montgomery.unprepare(y_mont)
-    }
-
-    /// Returns x^-1 mod q.
-    /// User must ensure that x is not divisible by q_base.
-    #[inline(always)]
-    pub fn inv(&self, x: u64) -> u64{
-        self.pow(x, self.phi)
-    }
-}
-
-/// Returns x^exponent mod q.
-/// This function internally instantiate a new MontgomeryPrecomp<u64>
-/// To be used when called only a few times and if there 
-/// is no Prime instantiated with q.
-pub fn Pow(x:u64, exponent:u64, q:u64) -> u64{
-    let montgomery: MontgomeryPrecomp<u64> = MontgomeryPrecomp::<u64>::new(q);
-    let mut y_mont: Montgomery<u64> = montgomery.one();
-    let mut x_mont: Montgomery<u64> = montgomery.prepare(x);
-    let mut i: u64 = exponent;
-    while i > 0{
-        if i & 1 == 1{
-            montgomery.mul_internal_assign(x_mont, &mut y_mont);
-        }
-
-        montgomery.mul_internal_assign(x_mont, &mut x_mont);
-
-        i >>= 1;
-    }
-    
-    montgomery.unprepare(y_mont)
-}
-
-impl Prime<u64>{
-    /// Returns the smallest nth primitive root of q_base.
-    pub fn primitive_root(&mut self) -> u64{
-
-        self.check_factors();
-
-        let mut candidate: u64 = 1u64;
-        let mut not_found: bool = true;
-
-        while not_found{
-
-            candidate += 1;
-
-            for &factor in &self.factors{
-
-                if  Pow(candidate, (self.q_base-1)/factor, self.q_base) == 1{
-                    not_found = true;
-                    break
-                }
-                not_found = false;
-            }
-        }
-
-        if not_found{
-            panic!("failed to find a primitive root for q_base={}", self.q_base)
-        }
-
-        candidate
-    }
-
-    /// Returns an nth primitive root of q = q_base^q_power in Montgomery.
-    pub fn primitive_nth_root(&mut self, nth_root:u64) -> u64{
-
-        assert!(self.q & (nth_root-1) == 1, "invalid prime: q = {} % nth_root = {} = {} != 1", self.q, nth_root, self.q & (nth_root-1));
-
-        let psi: u64 = self.primitive_root();
-
-        // nth primitive root mod q_base: psi_nth^(prime.q_base-1)/nth_root mod q_base
-        let psi_nth_q_base: u64 = Pow(psi, (self.q_base-1)/nth_root, self.q_base);
-
-        // lifts nth primitive root mod q_base to q = q_base^q_power
-        let psi_nth_q: u64 = self.hensel_lift(psi_nth_q_base, nth_root);
-
-        assert!(self.pow(psi_nth_q, nth_root) == 1, "invalid nth primitive root: psi^nth_root != 1 mod q");
-        assert!(self.pow(psi_nth_q, nth_root>>1) == self.q-1, "invalid nth primitive root: psi^(nth_root/2) != -1 mod q");
-
-        psi_nth_q
-    }
-
-    /// Checks if the field self.factor is populated.
-    /// If not, factorize q_base-1 and populates self.factor.
-    /// If yes, checks that it contains the unique factors of q_base-1.
-    pub fn check_factors(&mut self){
-
-        if self.factors.len() == 0{
-
-            let factors = Factorization::run(self.q_base-1).prime_factor_repr();
-            let mut distincts_factors: Vec<u64> = Vec::with_capacity(factors.len());
-            for factor in factors.iter(){
-                distincts_factors.push(factor.0)
-            }
-            self.factors = distincts_factors
-            
-        }else{
-            let mut q_base: u64 = self.q_base;
-
-            for &factor in &self.factors{
-                if !is_prime(factor){
-                    panic!("invalid factor list: factor {} is not prime", factor)
-                }
-    
-                while q_base%factor != 0{
-                    q_base /= factor
-                }
-            }
-    
-            if q_base != 1{
-                panic!("invalid factor list: does not fully divide q_base: q_base % (all factors) = {}", q_base)
-            }
-        }
-
-        
-    }
-
-    /// Returns (psi + a * q_base)^{nth_root} = 1 mod q = q_base^q_power given psi^{nth_root} = 1 mod q_base.
-    /// Panics if psi^{nth_root} != 1 mod q_base.
-    fn hensel_lift(&self, psi: u64, nth_root: u64) -> u64{
-        assert!(Pow(psi, nth_root, self.q_base)==1, "invalid argument psi: psi^nth_root = {} != 1", Pow(psi, nth_root, self.q_base));
-
-        let mut psi_mont: Montgomery<u64> = self.montgomery.prepare(psi);
-        let nth_root_mont: Montgomery<u64> = self.montgomery.prepare(nth_root);
-
-        for _i in 1..self.q_power{
-
-            let psi_pow: Montgomery<u64> = self.montgomery.pow(psi_mont, nth_root-1);
-
-            let num: Montgomery<u64> = Montgomery(self.montgomery.one().value() + self.q - self.montgomery.mul_internal(psi_pow, psi_mont).value());
-
-            let mut den: Montgomery<u64> = self.montgomery.mul_internal(nth_root_mont, psi_pow);
-
-            den = self.montgomery.pow(den, self.phi-1);
-
-            psi_mont = self.montgomery.add_internal(psi_mont, self.montgomery.mul_internal(num, den));
-        }
-        
-        self.montgomery.unprepare(psi_mont)
-    }
+	pub size: f64,
+	pub next_prime: O,
+	pub prev_prime: O,
+	pub check_next_prime: bool,
+	pub check_prev_prime: bool,
 }
--- a/src/modulus/shoup.rs
+++ b/src/modulus/shoup.rs
@@ -0,0 +1,21 @@
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Shoup<O>(pub O, pub O);
+
+impl<O> Shoup<O> {
+    
+    #[inline(always)]
+    pub fn value(&self) -> &O {
+        &self.0
+    }
+
+    #[inline(always)]
+    pub fn quotient(&self) -> &O {
+        &self.1
+    }
+}
+
+pub struct ShoupPrecomp<O>{
+    pub q: O,
+    pub one: Shoup<O>,
+}
+
--- a/src/poly.rs
+++ b/src/poly.rs
@@ -0,0 +1,22 @@
+pub mod poly;
+
+pub struct Poly<O>(pub Vec<O>);
+
+impl Poly<u64>{
+    pub fn new(n: usize) -> Self{
+        Self(vec![0u64;n])
+    }
+
+    pub fn buffer_size(&self) -> usize{
+        return self.0.len()
+    }
+
+    pub fn from_buffer(&mut self, n: usize, buf: &mut [u64]){
+        assert!(buf.len() >= n, "invalid buffer: buf.len()={} < n={}", buf.len(), n);
+        self.0 = Vec::from(&buf[..n]);
+    }
+
+    pub fn n(&self) -> usize{
+        return self.0.len()
+    }
+}
--- a/src/poly/poly.rs
+++ b/src/poly/poly.rs
--- a/src/ring.rs
+++ b/src/ring.rs
@@ -0,0 +1,11 @@
+pub mod impl_u64;
+
+use crate::modulus::prime::Prime;
+use crate::dft::DFT;
+
+
+pub struct Ring<O>{
+    pub n:usize,
+    pub modulus:Prime<O>,
+    pub dft:dyn DFT<O>,
+}
--- a/src/ring/impl_u64/automorphism.rs
+++ b/src/ring/impl_u64/automorphism.rs
@@ -0,0 +1,43 @@
+use crate::modulus::WordOps;
+use crate::ring::Ring;
+use crate::poly::Poly;
+
+/// Returns a lookup table for the automorphism X^{i} -> X^{i * k mod nth_root}.
+/// Method will panic if n or nth_root are not power-of-two.
+/// Method will panic if gal_el is not coprime with nth_root.
+pub fn automorphism_index_ntt(n: usize, nth_root:u64, gal_el: u64) -> (Vec<u64>){
+    assert!(n&(n-1) != 0, "invalid n={}: not a power-of-two", n);
+    assert!(nth_root&(nth_root-1) != 0, "invalid nth_root={}: not a power-of-two", n);
+    assert!(gal_el & 1 == 1, "invalid gal_el={}: not coprime with nth_root={}", gal_el, nth_root);
+
+    let mask = nth_root-1;
+    let log_nth_root: u32 = nth_root.log2() as u32;
+    let mut index: Vec<u64> = Vec::with_capacity(n);
+    for i in 0..n{
+        let i_rev: usize = 2*i.reverse_bits_msb(log_nth_root)+1;
+        let gal_el_i: u64 = (gal_el * (i_rev as u64) & mask)>>1;
+        index.push(gal_el_i.reverse_bits_msb(log_nth_root));
+    }
+    index
+}
+
+impl Ring<u64>{
+    pub fn automorphism(&self, a:Poly<u64>, gal_el: u64, b:&mut Poly<u64>){
+        debug_assert!(a.n() == b.n(), "invalid inputs: a.n() = {} != b.n() = {}", a.n(), b.n());
+        debug_assert!(gal_el&1 == 1, "invalid gal_el = {}: not odd", gal_el);
+
+        let n: usize = a.n();
+        let mask: u64 = (n-1) as u64;
+        let log_n: usize = n.log2();
+        let q: u64 = self.modulus.q();
+        let b_vec: &mut _ = &mut b.0;
+        let a_vec: &_ = &a.0;
+
+        for i in 0..n{
+            let i_in: u64 = i as u64 * gal_el;
+            let i_out: u64 = i_in & mask;
+            let sign: u64 = (i_in>>log_n) & 1;
+            b_vec[i_out as usize] = a_vec[i_in as usize] * (sign^1) | (q - a_vec[i_in as usize]) * sign
+        }
+    }
+}
--- a/src/ring/impl_u64/mod.rs
+++ b/src/ring/impl_u64/mod.rs
@@ -0,0 +1 @@
+pub mod automorphism;