Faster ff arithmetics (regenerated code with the newest goff) (#43)

3 years ago · 69354ae29c
17 changed files with 4215 additions and 739 deletions
--- a/babyjub/babyjub.go
+++ b/babyjub/babyjub.go
@ -95,20 +95,20 @@ func (p *PointProjective) Add(q *PointProjective, o *PointProjective) *PointProj
 	c := ff.NewElement().Mul(q.X, o.X)
 	d := ff.NewElement().Mul(q.Y, o.Y)
 	e := ff.NewElement().Mul(Dff, c)
 	e.MulAssign(d)
 	e.Mul(e, d)
 	f := ff.NewElement().Sub(b, e)
 	g := ff.NewElement().Add(b, e)
 	x1y1 := ff.NewElement().Add(q.X, q.Y)
 	x2y2 := ff.NewElement().Add(o.X, o.Y)
 	x3 := ff.NewElement().Mul(x1y1, x2y2)
 	x3.SubAssign(c)
 	x3.SubAssign(d)
 	x3.MulAssign(a)
 	x3.MulAssign(f)
 	x3.Sub(x3, c)
 	x3.Sub(x3, d)
 	x3.Mul(x3, a)
 	x3.Mul(x3, f)
 	ac := ff.NewElement().Mul(Aff, c)
 	y3 := ff.NewElement().Sub(d, ac)
 	y3.MulAssign(a)
 	y3.MulAssign(g)
 	y3.Mul(y3, a)
 	y3.Mul(y3, g)
 	z3 := ff.NewElement().Mul(f, g)

 	p.X = x3
--- a/ff/arith.go
+++ b/ff/arith.go
@ -1,4 +1,4 @@
 // Copyright 2020 ConsenSys AG
 // Copyright 2020 ConsenSys Software Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Code generated by goff DO NOT EDIT
 // Code generated by consensys/gnark-crypto DO NOT EDIT

 package ff

@ -20,15 +20,6 @@ import (
 	"math/bits"
 )

 // madd computes a*b + (u·2^64 + v) and folds the resulting carry into t.
 //
 // It returns the three words of the result, most significant first
 // (t, u, v). A carry out of t itself is discarded.
 func madd(a, b, t, u, v uint64) (uint64, uint64, uint64) {
 	prodHi, prodLo := bits.Mul64(a, b)

 	// low word, then middle word, threading the carry upwards
 	low, c0 := bits.Add64(prodLo, v, 0)
 	mid, c1 := bits.Add64(prodHi, u, c0)

 	// the last carry lands in the top word
 	top, _ := bits.Add64(t, 0, c1)
 	return top, mid, low
 }

 // madd0 hi = a*b + c (discards lo bits)
 func madd0(a, b, c uint64) (hi uint64) {
 	var carry, lo uint64
@ -58,59 +49,6 @@ func madd2(a, b, c, d uint64) (hi uint64, lo uint64) {
 	return
 }

 // madd2s superhi, hi, lo = 2*a*b + c + e + d·2^64
 //
 // c and e are added at the weight of lo; d is added at the weight of hi
 // (it is the word carried over from a previous step).
 // Every carry produced along the way is propagated into superhi, so the
 // three returned words hold the exact value of the expression.
 func madd2s(a, b, c, d, e uint64) (superhi, hi, lo uint64) {
 	var carry, sum uint64

 	// double the 128-bit product; the bit shifted out of hi seeds superhi
 	hi, lo = bits.Mul64(a, b)
 	lo, carry = bits.Add64(lo, lo, 0)
 	hi, superhi = bits.Add64(hi, hi, carry)

 	// lo += c + e, rippling each carry through hi and into superhi
 	sum, carry = bits.Add64(c, e, 0)
 	hi, carry = bits.Add64(hi, 0, carry)
 	superhi += carry
 	lo, carry = bits.Add64(lo, sum, 0)
 	hi, carry = bits.Add64(hi, 0, carry)
 	superhi += carry

 	// hi += d; d is passed as an operand (not as the carry-in, which
 	// bits.Add64 only defines for 0 or 1)
 	hi, carry = bits.Add64(hi, d, 0)
 	superhi += carry
 	return
 }

 // madd1s superhi, hi, lo = 2*a*b + e + d·2^64
 //
 // e is added at the weight of lo; d is added at the weight of hi.
 // Every carry produced along the way is propagated into superhi, so the
 // three returned words hold the exact value of the expression.
 func madd1s(a, b, d, e uint64) (superhi, hi, lo uint64) {
 	var carry uint64

 	// double the 128-bit product; the bit shifted out of hi seeds superhi
 	hi, lo = bits.Mul64(a, b)
 	lo, carry = bits.Add64(lo, lo, 0)
 	hi, superhi = bits.Add64(hi, hi, carry)

 	// lo += e, rippling the carry through hi and into superhi
 	lo, carry = bits.Add64(lo, e, 0)
 	hi, carry = bits.Add64(hi, 0, carry)
 	superhi += carry

 	// hi += d; d is passed as an operand (not as the carry-in, which
 	// bits.Add64 only defines for 0 or 1)
 	hi, carry = bits.Add64(hi, d, 0)
 	superhi += carry
 	return
 }

 // madd2sb superhi, hi, lo = 2*a*b + c + e
 //
 // Every carry produced along the way is propagated into superhi, so the
 // three returned words hold the exact value of the expression.
 func madd2sb(a, b, c, e uint64) (superhi, hi, lo uint64) {
 	var carry, sum uint64

 	// double the 128-bit product; the bit shifted out of hi seeds superhi
 	hi, lo = bits.Mul64(a, b)
 	lo, carry = bits.Add64(lo, lo, 0)
 	hi, superhi = bits.Add64(hi, hi, carry)

 	// lo += c + e, rippling each carry through hi and into superhi
 	sum, carry = bits.Add64(c, e, 0)
 	hi, carry = bits.Add64(hi, 0, carry)
 	superhi += carry
 	lo, carry = bits.Add64(lo, sum, 0)
 	hi, carry = bits.Add64(hi, 0, carry)
 	superhi += carry
 	return
 }

 // madd1sb superhi, hi, lo = 2*a*b + e
 //
 // The carry out of lo is propagated through hi and into superhi, so the
 // three returned words hold the exact value of the expression.
 func madd1sb(a, b, e uint64) (superhi, hi, lo uint64) {
 	var carry uint64

 	// double the 128-bit product; the bit shifted out of hi seeds superhi
 	hi, lo = bits.Mul64(a, b)
 	lo, carry = bits.Add64(lo, lo, 0)
 	hi, superhi = bits.Add64(hi, hi, carry)

 	// lo += e, rippling the carry through hi and into superhi
 	lo, carry = bits.Add64(lo, e, 0)
 	hi, carry = bits.Add64(hi, 0, carry)
 	superhi += carry
 	return
 }

 func madd3(a, b, c, d, e uint64) (hi uint64, lo uint64) {
 	var carry uint64
 	hi, lo = bits.Mul64(a, b)
--- a/ff/asm.go
+++ b/ff/asm.go
@ -0,0 +1,24 @@
 //go:build !noadx
 // +build !noadx

 // Copyright 2020 ConsenSys Software Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Code generated by consensys/gnark-crypto DO NOT EDIT

 package ff

 import "golang.org/x/sys/cpu"

 // supportAdx is true only when the CPU reports both the ADX and the BMI2
 // extensions. The amd64 assembly compares it against 1 to decide between the
 // MULX/ADCX/ADOX code path and the generic Go fallback.
 var supportAdx = cpu.X86.HasADX && cpu.X86.HasBMI2
--- a/ff/asm_noadx.go
+++ b/ff/asm_noadx.go
@ -0,0 +1,25 @@
 //go:build noadx
 // +build noadx

 // Copyright 2020 ConsenSys Software Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Code generated by consensys/gnark-crypto DO NOT EDIT

 package ff

 // supportAdx is hard-wired to false when building with the noadx tag.
 // Note: this is needed for test purposes, because changing supportAdx
 // dynamically does not surface certain errors (like "fatal error: missing stackmap").
 // Building with this tag ensures that all assembly paths get tested.
 var supportAdx = false
--- a/ff/doc.go
+++ b/ff/doc.go
@ -0,0 +1,43 @@
 // Copyright 2020 ConsenSys Software Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Code generated by consensys/gnark-crypto DO NOT EDIT

 // Package ff contains field arithmetic operations for modulus = 0x30644e...000001.
 //
 // The API is similar to math/big (big.Int), but the operations are significantly faster (up to 20x for the modular multiplication on amd64; see also https://hackmd.io/@zkteam/modular_multiplication).
 //
 // The modulus is hardcoded in all the operations.
 //
 // Field elements are represented as an array, and assumed to be in Montgomery form in all methods:
 // 	type Element [4]uint64
 //
 // Example API signature
 // 	// Mul z = x * y mod q
 // 	func (z *Element) Mul(x, y *Element) *Element
 //
 // and can be used like so:
 // 	var a, b Element
 // 	a.SetUint64(2)
 // 	b.SetString("984896738")
 // 	a.Mul(&a, &b)
 // 	a.Sub(&a, &a).
 // 		Add(&a, &b).
 // 		Inverse(&a)
 // 	b.Exp(b, new(big.Int).SetUint64(42))
 //
 // Modulus
 // 	0x30644e72e131a029b85045b68181585d2833e84879b9709143e1f593f0000001 // base 16
 // 	21888242871839275222246405745257275088548364400416034343698204186575808495617 // base 10
 package ff
--- a/ff/element.go
+++ b/ff/element.go
--- a/ff/element_fuzz.go
+++ b/ff/element_fuzz.go
@ -0,0 +1,136 @@
 //go:build gofuzz
 // +build gofuzz

 // Copyright 2020 ConsenSys Software Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Code generated by consensys/gnark-crypto DO NOT EDIT

 package ff

 import (
 	"bytes"
 	"encoding/binary"
 	"io"
 	"math/big"
 	"math/bits"
 )

 // Return codes for Fuzz.
 // NOTE(review): the values match the go-fuzz convention (1 = prioritise the
 // input, 0 = neutral, -1 = do not add to the corpus) — confirm against the
 // fuzzer in use. Only fuzzNormal is returned by the Fuzz function in this file.
 const (
 	fuzzInteresting = 1
 	fuzzNormal      = 0
 	fuzzDiscard     = -1
 )

 // Fuzz is the fuzzing entry point for Element arithmetic.
 //
 // It derives two elements from data and panics as soon as one of the
 // following properties is violated:
 //   - Mul and _mulGeneric agree,
 //   - Inverse agrees with big.Int.ModInverse on the regular representation,
 //   - a + (-a) is zero.
 //
 // It always returns fuzzNormal when no property is violated.
 func Fuzz(data []byte) int {
 	src := bytes.NewReader(data)

 	var e1, e2 Element
 	e1.SetRawBytes(src)
 	e2.SetRawBytes(src)

 	// multiplication: assembly path vs generic path, each on its own copies
 	{
 		lhsAsm, rhsAsm := e1, e2
 		lhsGen, rhsGen := e1, e2

 		var viaAsm, viaGeneric Element
 		viaAsm.Mul(&lhsAsm, &rhsAsm)
 		_mulGeneric(&viaGeneric, &lhsGen, &rhsGen)

 		if !viaAsm.Equal(&viaGeneric) {
 			panic("mul asm != mul generic on Element")
 		}
 	}

 	// inverse: compare against math/big
 	{
 		got := e1
 		got.Inverse(&got)

 		var want, operand, gotBig big.Int
 		e1.ToBigIntRegular(&operand)
 		want.ModInverse(&operand, Modulus())
 		got.ToBigIntRegular(&gotBig)

 		if gotBig.Cmp(&want) != 0 {
 			panic("inverse operation doesn't match big int result")
 		}
 	}

 	// additive inverse: a + -a == 0
 	{
 		sum, negated := e1, e1
 		negated.Neg(&negated)
 		sum.Add(&sum, &negated)
 		if !sum.IsZero() {
 			panic("a + -a != 0")
 		}
 	}

 	return fuzzNormal
 }

 // SetRawBytes fills z from r, one 8-byte big-endian word per limb, starting
 // with z[0]. Reading stops at the first short read or error; limbs that were
 // not reached keep their previous value.
 //
 // The top limb is then reduced modulo the top limb of q, and q is subtracted
 // once if z still compares as not smaller than q.
 // NOTE(review): after the modulo, z[3] is strictly smaller than qElement[3],
 // so with the BiggerModulus in this file the subtraction looks unreachable.
 //
 // Used for fuzzing purposes only.
 func (z *Element) SetRawBytes(r io.Reader) {
 	var word [8]byte

 	for i := range z {
 		if _, err := io.ReadFull(r, word[:]); err != nil {
 			break
 		}
 		z[i] = binary.BigEndian.Uint64(word[:])
 	}

 	z[3] %= qElement[3]

 	if !z.BiggerModulus() {
 		return
 	}

 	// z -= q, limb by limb with a borrow chain
 	var borrow uint64
 	z[0], borrow = bits.Sub64(z[0], qElement[0], 0)
 	z[1], borrow = bits.Sub64(z[1], qElement[1], borrow)
 	z[2], borrow = bits.Sub64(z[2], qElement[2], borrow)
 	z[3], _ = bits.Sub64(z[3], qElement[3], borrow)
 }

 // BiggerModulus reports whether z, read as a 4-limb integer with z[3] most
 // significant, is greater than or equal to qElement.
 func (z *Element) BiggerModulus() bool {
 	// the first differing limb, scanning from the top, decides
 	for i := 3; i > 0; i-- {
 		if z[i] != qElement[i] {
 			return z[i] > qElement[i]
 		}
 	}

 	// upper limbs are all equal: the lowest limb decides, equality counts
 	return z[0] >= qElement[0]
 }
--- a/ff/element_mul_adx_amd64.s
+++ b/ff/element_mul_adx_amd64.s
@ -0,0 +1,466 @@
 // +build amd64_adx

 // Copyright 2020 ConsenSys Software Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "textflag.h"
 #include "funcdata.h"

 // modulus q
 // stored as four 64-bit limbs, least significant limb at offset 0
 DATA q<>+0(SB)/8, $0x43e1f593f0000001
 DATA q<>+8(SB)/8, $0x2833e84879b97091
 DATA q<>+16(SB)/8, $0xb85045b68181585d
 DATA q<>+24(SB)/8, $0x30644e72e131a029
 GLOBL q<>(SB), (RODATA+NOPTR), $32

 // qInv0 q'[0]
 // the constant m is derived from in "m := t[0]*q'[0] mod W" below
 DATA qInv0<>(SB)/8, $0xc2e1f593efffffff
 GLOBL qInv0<>(SB), (RODATA+NOPTR), $8

 // REDUCE conditionally subtracts q from the element held in (ra0..ra3):
 // it saves the input in (rb0..rb3), computes ra - q with a borrow chain and
 // restores the saved value when the subtraction borrowed (carry set).
 // The rb registers and the flags are clobbered.
 #define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \
 	MOVQ    ra0, rb0;        \
 	SUBQ    q<>(SB), ra0;    \
 	MOVQ    ra1, rb1;        \
 	SBBQ    q<>+8(SB), ra1;  \
 	MOVQ    ra2, rb2;        \
 	SBBQ    q<>+16(SB), ra2; \
 	MOVQ    ra3, rb3;        \
 	SBBQ    q<>+24(SB), ra3; \
 	CMOVQCS rb0, ra0;        \
 	CMOVQCS rb1, ra1;        \
 	CMOVQCS rb2, ra2;        \
 	CMOVQCS rb3, ra3;        \

 // mul(res, x, y *Element)
 //
 // Multiplies x by y with the interleaved multiply/reduce algorithm outlined
 // below and stores the 4-limb result, after a final conditional subtraction
 // of q, in res.
 // This variant uses MULX/ADCX/ADOX unconditionally: there is no supportAdx
 // check and no fallback in this file.
 TEXT ·mul(SB), NOSPLIT, $0-24

 	// the algorithm is described here
 	// https://hackmd.io/@zkteam/modular_multiplication
 	// however, to benefit from the ADCX and ADOX carry chains
 	// we split the inner loops in 2:
 	// for i=0 to N-1
 	// 		for j=0 to N-1
 	// 		    (A,t[j])  := t[j] + x[j]*y[i] + A
 	// 		m := t[0]*q'[0] mod W
 	// 		C,_ := t[0] + m*q[0]
 	// 		for j=1 to N-1
 	// 		    (C,t[j-1]) := t[j] + m*q[j] + C
 	// 		t[N-1] = C + A

 	MOVQ x+8(FP), SI

 	// x[0] -> DI
 	// x[1] -> R8
 	// x[2] -> R9
 	// x[3] -> R10
 	MOVQ 0(SI), DI
 	MOVQ 8(SI), R8
 	MOVQ 16(SI), R9
 	MOVQ 24(SI), R10
 	MOVQ y+16(FP), R11

 	// A -> BP
 	// t[0] -> R14
 	// t[1] -> R15
 	// t[2] -> CX
 	// t[3] -> BX
 	// clear the flags
 	XORQ AX, AX
 	MOVQ 0(R11), DX

 	// first round: t is written directly by MULX, so only the ADOX chain
 	// is used to add the low halves onto the previous high halves
 	// (A,t[0])  := x[0]*y[0] + A
 	MULXQ DI, R14, R15

 	// (A,t[1])  := x[1]*y[0] + A
 	MULXQ R8, AX, CX
 	ADOXQ AX, R15

 	// (A,t[2])  := x[2]*y[0] + A
 	MULXQ R9, AX, BX
 	ADOXQ AX, CX

 	// (A,t[3])  := x[3]*y[0] + A
 	MULXQ R10, AX, BP
 	ADOXQ AX, BX

 	// A += carries from ADCXQ and ADOXQ
 	MOVQ  $0, AX
 	ADOXQ AX, BP

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX

 	// clear the flags
 	XORQ AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, R12
 	ADCXQ R14, AX
 	MOVQ  R12, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX

 	// t[3] = C + A
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ BP, BX

 	// clear the flags
 	XORQ AX, AX
 	MOVQ 8(R11), DX

 	// (A,t[0])  := t[0] + x[0]*y[1] + A
 	MULXQ DI, AX, BP
 	ADOXQ AX, R14

 	// (A,t[1])  := t[1] + x[1]*y[1] + A
 	ADCXQ BP, R15
 	MULXQ R8, AX, BP
 	ADOXQ AX, R15

 	// (A,t[2])  := t[2] + x[2]*y[1] + A
 	ADCXQ BP, CX
 	MULXQ R9, AX, BP
 	ADOXQ AX, CX

 	// (A,t[3])  := t[3] + x[3]*y[1] + A
 	ADCXQ BP, BX
 	MULXQ R10, AX, BP
 	ADOXQ AX, BX

 	// A += carries from ADCXQ and ADOXQ
 	MOVQ  $0, AX
 	ADCXQ AX, BP
 	ADOXQ AX, BP

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX

 	// clear the flags
 	XORQ AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, R12
 	ADCXQ R14, AX
 	MOVQ  R12, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX

 	// t[3] = C + A
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ BP, BX

 	// clear the flags
 	XORQ AX, AX
 	MOVQ 16(R11), DX

 	// (A,t[0])  := t[0] + x[0]*y[2] + A
 	MULXQ DI, AX, BP
 	ADOXQ AX, R14

 	// (A,t[1])  := t[1] + x[1]*y[2] + A
 	ADCXQ BP, R15
 	MULXQ R8, AX, BP
 	ADOXQ AX, R15

 	// (A,t[2])  := t[2] + x[2]*y[2] + A
 	ADCXQ BP, CX
 	MULXQ R9, AX, BP
 	ADOXQ AX, CX

 	// (A,t[3])  := t[3] + x[3]*y[2] + A
 	ADCXQ BP, BX
 	MULXQ R10, AX, BP
 	ADOXQ AX, BX

 	// A += carries from ADCXQ and ADOXQ
 	MOVQ  $0, AX
 	ADCXQ AX, BP
 	ADOXQ AX, BP

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX

 	// clear the flags
 	XORQ AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, R12
 	ADCXQ R14, AX
 	MOVQ  R12, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX

 	// t[3] = C + A
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ BP, BX

 	// clear the flags
 	XORQ AX, AX
 	MOVQ 24(R11), DX

 	// (A,t[0])  := t[0] + x[0]*y[3] + A
 	MULXQ DI, AX, BP
 	ADOXQ AX, R14

 	// (A,t[1])  := t[1] + x[1]*y[3] + A
 	ADCXQ BP, R15
 	MULXQ R8, AX, BP
 	ADOXQ AX, R15

 	// (A,t[2])  := t[2] + x[2]*y[3] + A
 	ADCXQ BP, CX
 	MULXQ R9, AX, BP
 	ADOXQ AX, CX

 	// (A,t[3])  := t[3] + x[3]*y[3] + A
 	ADCXQ BP, BX
 	MULXQ R10, AX, BP
 	ADOXQ AX, BX

 	// A += carries from ADCXQ and ADOXQ
 	MOVQ  $0, AX
 	ADCXQ AX, BP
 	ADOXQ AX, BP

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX

 	// clear the flags
 	XORQ AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, R12
 	ADCXQ R14, AX
 	MOVQ  R12, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX

 	// t[3] = C + A
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ BP, BX

 	// reduce element(R14,R15,CX,BX) using temp registers (R13,SI,R12,R11)
 	// SI and R11 held the x and y pointers; every limb has been read by now,
 	// so both are free to be reused as scratch
 	REDUCE(R14,R15,CX,BX,R13,SI,R12,R11)

 	MOVQ res+0(FP), AX
 	MOVQ R14, 0(AX)
 	MOVQ R15, 8(AX)
 	MOVQ CX, 16(AX)
 	MOVQ BX, 24(AX)
 	RET

 // fromMont(res *Element)
 //
 // Works in place on res: loads its four limbs, runs four reduction rounds
 // (the multiplication algorithm specialised to y = 1, see below), applies a
 // final conditional subtraction of q and writes the limbs back.
 // This variant uses MULX/ADCX/ADOX unconditionally.
 TEXT ·fromMont(SB), NOSPLIT, $0-8

 	// the algorithm is described here
 	// https://hackmd.io/@zkteam/modular_multiplication
 	// when y = 1 we have:
 	// for i=0 to N-1
 	// 		t[i] = x[i]
 	// for i=0 to N-1
 	// 		m := t[0]*q'[0] mod W
 	// 		C,_ := t[0] + m*q[0]
 	// 		for j=1 to N-1
 	// 		    (C,t[j-1]) := t[j] + m*q[j] + C
 	// 		t[N-1] = C
 	MOVQ res+0(FP), DX
 	MOVQ 0(DX), R14
 	MOVQ 8(DX), R15
 	MOVQ 16(DX), CX
 	MOVQ 24(DX), BX
 	// NOTE(review): DX is overwritten by the MOVQ just below and the flags are
 	// cleared again by XORQ AX, AX, so this XORQ appears to have no lasting effect
 	XORQ DX, DX

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX
 	XORQ  AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, BP
 	ADCXQ R14, AX
 	MOVQ  BP, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ AX, BX
 	XORQ  DX, DX

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX
 	XORQ  AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, BP
 	ADCXQ R14, AX
 	MOVQ  BP, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ AX, BX
 	XORQ  DX, DX

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX
 	XORQ  AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, BP
 	ADCXQ R14, AX
 	MOVQ  BP, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ AX, BX
 	XORQ  DX, DX

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX
 	XORQ  AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, BP
 	ADCXQ R14, AX
 	MOVQ  BP, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ AX, BX

 	// reduce element(R14,R15,CX,BX) using temp registers (SI,DI,R8,R9)
 	REDUCE(R14,R15,CX,BX,SI,DI,R8,R9)

 	MOVQ res+0(FP), AX
 	MOVQ R14, 0(AX)
 	MOVQ R15, 8(AX)
 	MOVQ CX, 16(AX)
 	MOVQ BX, 24(AX)
 	RET
--- a/ff/element_mul_amd64.s
+++ b/ff/element_mul_amd64.s
@ -0,0 +1,488 @@
 // +build !amd64_adx

 // Copyright 2020 ConsenSys Software Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "textflag.h"
 #include "funcdata.h"

 // modulus q
 // stored as four 64-bit limbs, least significant limb at offset 0
 DATA q<>+0(SB)/8, $0x43e1f593f0000001
 DATA q<>+8(SB)/8, $0x2833e84879b97091
 DATA q<>+16(SB)/8, $0xb85045b68181585d
 DATA q<>+24(SB)/8, $0x30644e72e131a029
 GLOBL q<>(SB), (RODATA+NOPTR), $32

 // qInv0 q'[0]
 // the constant m is derived from in "m := t[0]*q'[0] mod W" below
 DATA qInv0<>(SB)/8, $0xc2e1f593efffffff
 GLOBL qInv0<>(SB), (RODATA+NOPTR), $8

 // REDUCE conditionally subtracts q from the element held in (ra0..ra3):
 // it saves the input in (rb0..rb3), computes ra - q with a borrow chain and
 // restores the saved value when the subtraction borrowed (carry set).
 // The rb registers and the flags are clobbered.
 #define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \
 	MOVQ    ra0, rb0;        \
 	SUBQ    q<>(SB), ra0;    \
 	MOVQ    ra1, rb1;        \
 	SBBQ    q<>+8(SB), ra1;  \
 	MOVQ    ra2, rb2;        \
 	SBBQ    q<>+16(SB), ra2; \
 	MOVQ    ra3, rb3;        \
 	SBBQ    q<>+24(SB), ra3; \
 	CMOVQCS rb0, ra0;        \
 	CMOVQCS rb1, ra1;        \
 	CMOVQCS rb2, ra2;        \
 	CMOVQCS rb3, ra3;        \

 // mul(res, x, y *Element)
 //
 // Run-time dispatched variant. When supportAdx is not 1 it jumps to l1 and
 // forwards the three pointer arguments to _mulGeneric through the 24-byte
 // outgoing argument area of its frame. Otherwise it multiplies x by y with
 // the MULX/ADCX/ADOX algorithm outlined below and stores the 4-limb result,
 // after a final conditional subtraction of q, in res.
 TEXT ·mul(SB), $24-24

 	// the algorithm is described here
 	// https://hackmd.io/@zkteam/modular_multiplication
 	// however, to benefit from the ADCX and ADOX carry chains
 	// we split the inner loops in 2:
 	// for i=0 to N-1
 	// 		for j=0 to N-1
 	// 		    (A,t[j])  := t[j] + x[j]*y[i] + A
 	// 		m := t[0]*q'[0] mod W
 	// 		C,_ := t[0] + m*q[0]
 	// 		for j=1 to N-1
 	// 		    (C,t[j-1]) := t[j] + m*q[j] + C
 	// 		t[N-1] = C + A

 	NO_LOCAL_POINTERS
 	// take the generic Go fallback unless the ADX path is enabled
 	CMPB ·supportAdx(SB), $1
 	JNE  l1
 	MOVQ x+8(FP), SI

 	// x[0] -> DI
 	// x[1] -> R8
 	// x[2] -> R9
 	// x[3] -> R10
 	MOVQ 0(SI), DI
 	MOVQ 8(SI), R8
 	MOVQ 16(SI), R9
 	MOVQ 24(SI), R10
 	MOVQ y+16(FP), R11

 	// A -> BP
 	// t[0] -> R14
 	// t[1] -> R15
 	// t[2] -> CX
 	// t[3] -> BX
 	// clear the flags
 	XORQ AX, AX
 	MOVQ 0(R11), DX

 	// first round: t is written directly by MULX, so only the ADOX chain
 	// is used to add the low halves onto the previous high halves
 	// (A,t[0])  := x[0]*y[0] + A
 	MULXQ DI, R14, R15

 	// (A,t[1])  := x[1]*y[0] + A
 	MULXQ R8, AX, CX
 	ADOXQ AX, R15

 	// (A,t[2])  := x[2]*y[0] + A
 	MULXQ R9, AX, BX
 	ADOXQ AX, CX

 	// (A,t[3])  := x[3]*y[0] + A
 	MULXQ R10, AX, BP
 	ADOXQ AX, BX

 	// A += carries from ADCXQ and ADOXQ
 	MOVQ  $0, AX
 	ADOXQ AX, BP

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX

 	// clear the flags
 	XORQ AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, R12
 	ADCXQ R14, AX
 	MOVQ  R12, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX

 	// t[3] = C + A
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ BP, BX

 	// clear the flags
 	XORQ AX, AX
 	MOVQ 8(R11), DX

 	// (A,t[0])  := t[0] + x[0]*y[1] + A
 	MULXQ DI, AX, BP
 	ADOXQ AX, R14

 	// (A,t[1])  := t[1] + x[1]*y[1] + A
 	ADCXQ BP, R15
 	MULXQ R8, AX, BP
 	ADOXQ AX, R15

 	// (A,t[2])  := t[2] + x[2]*y[1] + A
 	ADCXQ BP, CX
 	MULXQ R9, AX, BP
 	ADOXQ AX, CX

 	// (A,t[3])  := t[3] + x[3]*y[1] + A
 	ADCXQ BP, BX
 	MULXQ R10, AX, BP
 	ADOXQ AX, BX

 	// A += carries from ADCXQ and ADOXQ
 	MOVQ  $0, AX
 	ADCXQ AX, BP
 	ADOXQ AX, BP

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX

 	// clear the flags
 	XORQ AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, R12
 	ADCXQ R14, AX
 	MOVQ  R12, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX

 	// t[3] = C + A
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ BP, BX

 	// clear the flags
 	XORQ AX, AX
 	MOVQ 16(R11), DX

 	// (A,t[0])  := t[0] + x[0]*y[2] + A
 	MULXQ DI, AX, BP
 	ADOXQ AX, R14

 	// (A,t[1])  := t[1] + x[1]*y[2] + A
 	ADCXQ BP, R15
 	MULXQ R8, AX, BP
 	ADOXQ AX, R15

 	// (A,t[2])  := t[2] + x[2]*y[2] + A
 	ADCXQ BP, CX
 	MULXQ R9, AX, BP
 	ADOXQ AX, CX

 	// (A,t[3])  := t[3] + x[3]*y[2] + A
 	ADCXQ BP, BX
 	MULXQ R10, AX, BP
 	ADOXQ AX, BX

 	// A += carries from ADCXQ and ADOXQ
 	MOVQ  $0, AX
 	ADCXQ AX, BP
 	ADOXQ AX, BP

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX

 	// clear the flags
 	XORQ AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, R12
 	ADCXQ R14, AX
 	MOVQ  R12, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX

 	// t[3] = C + A
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ BP, BX

 	// clear the flags
 	XORQ AX, AX
 	MOVQ 24(R11), DX

 	// (A,t[0])  := t[0] + x[0]*y[3] + A
 	MULXQ DI, AX, BP
 	ADOXQ AX, R14

 	// (A,t[1])  := t[1] + x[1]*y[3] + A
 	ADCXQ BP, R15
 	MULXQ R8, AX, BP
 	ADOXQ AX, R15

 	// (A,t[2])  := t[2] + x[2]*y[3] + A
 	ADCXQ BP, CX
 	MULXQ R9, AX, BP
 	ADOXQ AX, CX

 	// (A,t[3])  := t[3] + x[3]*y[3] + A
 	ADCXQ BP, BX
 	MULXQ R10, AX, BP
 	ADOXQ AX, BX

 	// A += carries from ADCXQ and ADOXQ
 	MOVQ  $0, AX
 	ADCXQ AX, BP
 	ADOXQ AX, BP

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX

 	// clear the flags
 	XORQ AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, R12
 	ADCXQ R14, AX
 	MOVQ  R12, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX

 	// t[3] = C + A
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ BP, BX

 	// reduce element(R14,R15,CX,BX) using temp registers (R13,SI,R12,R11)
 	// SI and R11 held the x and y pointers; every limb has been read by now,
 	// so both are free to be reused as scratch
 	REDUCE(R14,R15,CX,BX,R13,SI,R12,R11)

 	MOVQ res+0(FP), AX
 	MOVQ R14, 0(AX)
 	MOVQ R15, 8(AX)
 	MOVQ CX, 16(AX)
 	MOVQ BX, 24(AX)
 	RET

 l1:
 	// fallback: copy (res, x, y) into the outgoing argument slots and call
 	// the generic implementation
 	MOVQ res+0(FP), AX
 	MOVQ AX, (SP)
 	MOVQ x+8(FP), AX
 	MOVQ AX, 8(SP)
 	MOVQ y+16(FP), AX
 	MOVQ AX, 16(SP)
 	CALL ·_mulGeneric(SB)
 	RET

 // fromMont(res *Element)
 //
 // Run-time dispatched variant. When supportAdx is not 1 it jumps to l2 and
 // forwards res to _fromMontGeneric through the 8-byte outgoing argument area
 // of its frame. Otherwise it works in place on res: four reduction rounds
 // (the multiplication algorithm specialised to y = 1, see below), a final
 // conditional subtraction of q, and the limbs are written back.
 TEXT ·fromMont(SB), $8-8
 	NO_LOCAL_POINTERS

 	// the algorithm is described here
 	// https://hackmd.io/@zkteam/modular_multiplication
 	// when y = 1 we have:
 	// for i=0 to N-1
 	// 		t[i] = x[i]
 	// for i=0 to N-1
 	// 		m := t[0]*q'[0] mod W
 	// 		C,_ := t[0] + m*q[0]
 	// 		for j=1 to N-1
 	// 		    (C,t[j-1]) := t[j] + m*q[j] + C
 	// 		t[N-1] = C
 	// take the generic Go fallback unless the ADX path is enabled
 	CMPB ·supportAdx(SB), $1
 	JNE  l2
 	MOVQ res+0(FP), DX
 	MOVQ 0(DX), R14
 	MOVQ 8(DX), R15
 	MOVQ 16(DX), CX
 	MOVQ 24(DX), BX
 	// NOTE(review): DX is overwritten by the MOVQ just below and the flags are
 	// cleared again by XORQ AX, AX, so this XORQ appears to have no lasting effect
 	XORQ DX, DX

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX
 	XORQ  AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, BP
 	ADCXQ R14, AX
 	MOVQ  BP, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ AX, BX
 	XORQ  DX, DX

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX
 	XORQ  AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, BP
 	ADCXQ R14, AX
 	MOVQ  BP, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ AX, BX
 	XORQ  DX, DX

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX
 	XORQ  AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, BP
 	ADCXQ R14, AX
 	MOVQ  BP, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ AX, BX
 	XORQ  DX, DX

 	// m := t[0]*q'[0] mod W
 	MOVQ  qInv0<>(SB), DX
 	IMULQ R14, DX
 	XORQ  AX, AX

 	// C,_ := t[0] + m*q[0]
 	MULXQ q<>+0(SB), AX, BP
 	ADCXQ R14, AX
 	MOVQ  BP, R14

 	// (C,t[0]) := t[1] + m*q[1] + C
 	ADCXQ R15, R14
 	MULXQ q<>+8(SB), AX, R15
 	ADOXQ AX, R14

 	// (C,t[1]) := t[2] + m*q[2] + C
 	ADCXQ CX, R15
 	MULXQ q<>+16(SB), AX, CX
 	ADOXQ AX, R15

 	// (C,t[2]) := t[3] + m*q[3] + C
 	ADCXQ BX, CX
 	MULXQ q<>+24(SB), AX, BX
 	ADOXQ AX, CX
 	MOVQ  $0, AX
 	ADCXQ AX, BX
 	ADOXQ AX, BX

 	// reduce element(R14,R15,CX,BX) using temp registers (SI,DI,R8,R9)
 	REDUCE(R14,R15,CX,BX,SI,DI,R8,R9)

 	MOVQ res+0(FP), AX
 	MOVQ R14, 0(AX)
 	MOVQ R15, 8(AX)
 	MOVQ CX, 16(AX)
 	MOVQ BX, 24(AX)
 	RET

 l2:
 	// fallback: pass res in the outgoing argument slot and call the generic
 	// implementation
 	MOVQ res+0(FP), AX
 	MOVQ AX, (SP)
 	CALL ·_fromMontGeneric(SB)
 	RET
--- a/ff/element_ops_amd64.go
+++ b/ff/element_ops_amd64.go
@ -0,0 +1,50 @@
 // Copyright 2020 ConsenSys Software Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Code generated by consensys/gnark-crypto DO NOT EDIT

 package ff

 // MulBy3 is implemented in assembly (body not visible here).
 // NOTE(review): presumably multiplies x by 3 in place, modulo q — confirm against the .s file.
 //go:noescape
 func MulBy3(x *Element)

 // MulBy5 is implemented in assembly (body not visible here).
 // NOTE(review): presumably multiplies x by 5 in place, modulo q — confirm against the .s file.
 //go:noescape
 func MulBy5(x *Element)

 // MulBy13 is implemented in assembly (body not visible here).
 // NOTE(review): presumably multiplies x by 13 in place, modulo q — confirm against the .s file.
 //go:noescape
 func MulBy13(x *Element)

 // add is implemented in assembly: it adds the four limbs of x and y with
 // carry, conditionally subtracts q once, and stores the result in res.
 //go:noescape
 func add(res, x, y *Element)

 // sub is implemented in assembly.
 // NOTE(review): presumably res = x - y modulo q — confirm against the .s file.
 //go:noescape
 func sub(res, x, y *Element)

 // neg is implemented in assembly (body not visible here).
 // NOTE(review): presumably res = -x modulo q — confirm against the .s file.
 //go:noescape
 func neg(res, x *Element)

 // double is implemented in assembly (body not visible here).
 // NOTE(review): presumably res = 2*x modulo q — confirm against the .s file.
 //go:noescape
 func double(res, x *Element)

 // mul is implemented in assembly (element_mul_*.s): it multiplies x by y
 // with interleaved reduction and stores the result in res.
 //go:noescape
 func mul(res, x, y *Element)

 // fromMont is implemented in assembly (element_mul_*.s): it runs the
 // reduction rounds of mul with y = 1 on res, in place.
 //go:noescape
 func fromMont(res *Element)

 // reduce is implemented in assembly (body not visible here).
 // NOTE(review): presumably subtracts q from res when res >= q — confirm against the .s file.
 //go:noescape
 func reduce(res *Element)

 // Butterfly is implemented in assembly (body not visible here).
 // NOTE(review): presumably sets a = a + b and b = a - b (FFT butterfly) — confirm against the .s file.
 //go:noescape
 func Butterfly(a, b *Element)
--- a/ff/element_ops_amd64.s
+++ b/ff/element_ops_amd64.s
@ -0,0 +1,340 @@
 // Copyright 2020 ConsenSys Software Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "textflag.h"
 #include "funcdata.h"

 // modulus q, stored as four 64-bit words in read-only, pointer-free data.
 // The word at offset 0 is the least significant one: REDUCE starts its borrow
 // chain (SUBQ, then SBBQ) at q<>+0 and finishes at q<>+24.
 DATA q<>+0(SB)/8, $0x43e1f593f0000001
 DATA q<>+8(SB)/8, $0x2833e84879b97091
 DATA q<>+16(SB)/8, $0xb85045b68181585d
 DATA q<>+24(SB)/8, $0x30644e72e131a029
 GLOBL q<>(SB), (RODATA+NOPTR), $32

 // qInv0 q'[0]: a single 64-bit read-only constant. No routine in this file
 // references it.
 // NOTE(review): presumably the Montgomery constant -q^(-1) mod 2^64 — confirm
 // against the generator.
 DATA qInv0<>(SB)/8, $0xc2e1f593efffffff
 GLOBL qInv0<>(SB), (RODATA+NOPTR), $8

 // REDUCE(ra0..ra3, rb0..rb3) conditionally subtracts the modulus q from the
 // 4-word value held in ra0 (least significant) .. ra3.
 //
 // Each ra word is first copied to the matching rb scratch register, then q is
 // subtracted from ra with a SUBQ/SBBQ borrow chain. If the chain ends with the
 // carry flag set (the value was smaller than q), the CMOVQCS instructions copy
 // the saved words back, so ra ends up unchanged; otherwise ra holds value - q.
 // MOVQ does not modify flags, so interleaving the copies with the borrow chain
 // is safe. The macro overwrites rb0..rb3 and the flags.
 #define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \
 	MOVQ    ra0, rb0;        \
 	SUBQ    q<>(SB), ra0;    \
 	MOVQ    ra1, rb1;        \
 	SBBQ    q<>+8(SB), ra1;  \
 	MOVQ    ra2, rb2;        \
 	SBBQ    q<>+16(SB), ra2; \
 	MOVQ    ra3, rb3;        \
 	SBBQ    q<>+24(SB), ra3; \
 	CMOVQCS rb0, ra0;        \
 	CMOVQCS rb1, ra1;        \
 	CMOVQCS rb2, ra2;        \
 	CMOVQCS rb3, ra3;        \

 // add(res, x, y *Element)
 //
 // Stores x + y into res, followed by one conditional subtraction of q.
 // Frame: no locals, 24 bytes of arguments (res at 0, x at 8, y at 16).
 // The carry out of the most significant word is not examined.
 // NOTE(review): this assumes x and y are already below q so that the sum fits
 // in four words — confirm against callers.
 TEXT ·add(SB), NOSPLIT, $0-24
 	// load the four words of x into (CX,BX,SI,DI)
 	MOVQ x+8(FP), AX
 	MOVQ 0(AX), CX
 	MOVQ 8(AX), BX
 	MOVQ 16(AX), SI
 	MOVQ 24(AX), DI
 	// add the four words of y, propagating the carry upwards
 	MOVQ y+16(FP), DX
 	ADDQ 0(DX), CX
 	ADCQ 8(DX), BX
 	ADCQ 16(DX), SI
 	ADCQ 24(DX), DI

 	// reduce element(CX,BX,SI,DI) using temp registers (R8,R9,R10,R11)
 	REDUCE(CX,BX,SI,DI,R8,R9,R10,R11)

 	// write the result to res
 	MOVQ res+0(FP), R12
 	MOVQ CX, 0(R12)
 	MOVQ BX, 8(R12)
 	MOVQ SI, 16(R12)
 	MOVQ DI, 24(R12)
 	RET

 // sub(res, x, y *Element)
 //
 // Stores x - y into res. When the 4-word subtraction borrows, q is added back;
 // otherwise zero is added, so both cases execute the same instructions.
 // Frame: no locals, 24 bytes of arguments (res at 0, x at 8, y at 16).
 TEXT ·sub(SB), NOSPLIT, $0-24
 	// DI = 0, used below as the "nothing to add back" value
 	XORQ    DI, DI
 	// load the four words of x into (AX,DX,CX,BX)
 	MOVQ    x+8(FP), SI
 	MOVQ    0(SI), AX
 	MOVQ    8(SI), DX
 	MOVQ    16(SI), CX
 	MOVQ    24(SI), BX
 	// subtract the four words of y, propagating the borrow upwards
 	MOVQ    y+16(FP), SI
 	SUBQ    0(SI), AX
 	SBBQ    8(SI), DX
 	SBBQ    16(SI), CX
 	SBBQ    24(SI), BX
 	// (R8,R9,R10,R11) = q; MOVQ leaves the borrow flag from SBBQ intact
 	MOVQ    $0x43e1f593f0000001, R8
 	MOVQ    $0x2833e84879b97091, R9
 	MOVQ    $0xb85045b68181585d, R10
 	MOVQ    $0x30644e72e131a029, R11
 	// no borrow (carry clear): replace q by zero so the addition is a no-op
 	CMOVQCC DI, R8
 	CMOVQCC DI, R9
 	CMOVQCC DI, R10
 	CMOVQCC DI, R11
 	// add back q (or zero)
 	ADDQ    R8, AX
 	ADCQ    R9, DX
 	ADCQ    R10, CX
 	ADCQ    R11, BX
 	// write the result to res
 	MOVQ    res+0(FP), R12
 	MOVQ    AX, 0(R12)
 	MOVQ    DX, 8(R12)
 	MOVQ    CX, 16(R12)
 	MOVQ    BX, 24(R12)
 	RET

 // double(res, x *Element)
 //
 // Stores 2*x into res, followed by one conditional subtraction of q.
 // Frame: no locals, 16 bytes of arguments (res at 0, x at 8).
 // The carry out of the most significant word is not examined.
 // NOTE(review): this assumes x is already below q so that 2*x fits in four
 // words — confirm against callers.
 TEXT ·double(SB), NOSPLIT, $0-16
 	// load the four words of x into (DX,CX,BX,SI)
 	MOVQ x+8(FP), AX
 	MOVQ 0(AX), DX
 	MOVQ 8(AX), CX
 	MOVQ 16(AX), BX
 	MOVQ 24(AX), SI
 	// add the value to itself, propagating the carry upwards
 	ADDQ DX, DX
 	ADCQ CX, CX
 	ADCQ BX, BX
 	ADCQ SI, SI

 	// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
 	REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)

 	// write the result to res
 	MOVQ res+0(FP), R11
 	MOVQ DX, 0(R11)
 	MOVQ CX, 8(R11)
 	MOVQ BX, 16(R11)
 	MOVQ SI, 24(R11)
 	RET

 // neg(res, x *Element)
 //
 // Stores q - x into res, except that x == 0 produces 0 rather than q.
 // Frame: no locals, 16 bytes of arguments (res at 0, x at 8).
 TEXT ·neg(SB), NOSPLIT, $0-16
 	MOVQ  res+0(FP), DI
 	// load the four words of x into (DX,CX,BX,SI)
 	MOVQ  x+8(FP), AX
 	MOVQ  0(AX), DX
 	MOVQ  8(AX), CX
 	MOVQ  16(AX), BX
 	MOVQ  24(AX), SI
 	// AX = OR of all words; it is zero exactly when x is zero
 	MOVQ  DX, AX
 	ORQ   CX, AX
 	ORQ   BX, AX
 	ORQ   SI, AX
 	TESTQ AX, AX
 	JEQ   l1
 	// x != 0: compute q - x one word at a time, reusing R8 for each word of q.
 	// The MOVQ loads and stores in between do not disturb the borrow flag.
 	MOVQ  $0x43e1f593f0000001, R8
 	SUBQ  DX, R8
 	MOVQ  R8, 0(DI)
 	MOVQ  $0x2833e84879b97091, R8
 	SBBQ  CX, R8
 	MOVQ  R8, 8(DI)
 	MOVQ  $0xb85045b68181585d, R8
 	SBBQ  BX, R8
 	MOVQ  R8, 16(DI)
 	MOVQ  $0x30644e72e131a029, R8
 	SBBQ  SI, R8
 	MOVQ  R8, 24(DI)
 	RET

 l1:
 	// x == 0: AX is zero here, so this writes an all-zero result
 	MOVQ AX, 0(DI)
 	MOVQ AX, 8(DI)
 	MOVQ AX, 16(DI)
 	MOVQ AX, 24(DI)
 	RET

 // reduce(res *Element)
 //
 // Rewrites res in place with one conditional subtraction of q: the value is
 // replaced by value - q when it is not below q, and left unchanged otherwise.
 // Frame: no locals, 8 bytes of arguments (res at 0).
 TEXT ·reduce(SB), NOSPLIT, $0-8
 	// load the four words of res into (DX,CX,BX,SI)
 	MOVQ res+0(FP), AX
 	MOVQ 0(AX), DX
 	MOVQ 8(AX), CX
 	MOVQ 16(AX), BX
 	MOVQ 24(AX), SI

 	// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
 	REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)

 	// AX still points at res; write the value back
 	MOVQ DX, 0(AX)
 	MOVQ CX, 8(AX)
 	MOVQ BX, 16(AX)
 	MOVQ SI, 24(AX)
 	RET

 // MulBy3(x *Element)
 //
 // Replaces x by 3*x, computed as (2*x) + x with a conditional subtraction of q
 // after each of the two additions.
 // Frame: no locals, 8 bytes of arguments (x at 0).
 TEXT ·MulBy3(SB), NOSPLIT, $0-8
 	// load the four words of x into (DX,CX,BX,SI); AX keeps pointing at x
 	MOVQ x+0(FP), AX
 	MOVQ 0(AX), DX
 	MOVQ 8(AX), CX
 	MOVQ 16(AX), BX
 	MOVQ 24(AX), SI
 	// 2*x
 	ADDQ DX, DX
 	ADCQ CX, CX
 	ADCQ BX, BX
 	ADCQ SI, SI

 	// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
 	REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)

 	// 2*x + x, reading the still-unmodified x from memory
 	ADDQ 0(AX), DX
 	ADCQ 8(AX), CX
 	ADCQ 16(AX), BX
 	ADCQ 24(AX), SI

 	// reduce element(DX,CX,BX,SI) using temp registers (R11,R12,R13,R14)
 	REDUCE(DX,CX,BX,SI,R11,R12,R13,R14)

 	// write the result back over x
 	MOVQ DX, 0(AX)
 	MOVQ CX, 8(AX)
 	MOVQ BX, 16(AX)
 	MOVQ SI, 24(AX)
 	RET

 // MulBy5(x *Element)
 //
 // Replaces x by 5*x, computed as 2*(2*x) + x with a conditional subtraction of
 // q after each of the three additions.
 // Frame: no locals, 8 bytes of arguments (x at 0).
 TEXT ·MulBy5(SB), NOSPLIT, $0-8
 	// load the four words of x into (DX,CX,BX,SI); AX keeps pointing at x
 	MOVQ x+0(FP), AX
 	MOVQ 0(AX), DX
 	MOVQ 8(AX), CX
 	MOVQ 16(AX), BX
 	MOVQ 24(AX), SI
 	// 2*x
 	ADDQ DX, DX
 	ADCQ CX, CX
 	ADCQ BX, BX
 	ADCQ SI, SI

 	// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
 	REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)

 	// 4*x
 	ADDQ DX, DX
 	ADCQ CX, CX
 	ADCQ BX, BX
 	ADCQ SI, SI

 	// reduce element(DX,CX,BX,SI) using temp registers (R11,R12,R13,R14)
 	REDUCE(DX,CX,BX,SI,R11,R12,R13,R14)

 	// 4*x + x, reading the still-unmodified x from memory
 	ADDQ 0(AX), DX
 	ADCQ 8(AX), CX
 	ADCQ 16(AX), BX
 	ADCQ 24(AX), SI

 	// reduce element(DX,CX,BX,SI) using temp registers (R15,DI,R8,R9)
 	REDUCE(DX,CX,BX,SI,R15,DI,R8,R9)

 	// write the result back over x
 	MOVQ DX, 0(AX)
 	MOVQ CX, 8(AX)
 	MOVQ BX, 16(AX)
 	MOVQ SI, 24(AX)
 	RET

 // MulBy13(x *Element)
 //
 // Replaces x by 13*x, computed as 8*x + 4*x + x. Every doubling and addition
 // is followed by a conditional subtraction of q.
 // Frame: no locals, 8 bytes of arguments (x at 0).
 TEXT ·MulBy13(SB), NOSPLIT, $0-8
 	// load the four words of x into (DX,CX,BX,SI); AX keeps pointing at x
 	MOVQ x+0(FP), AX
 	MOVQ 0(AX), DX
 	MOVQ 8(AX), CX
 	MOVQ 16(AX), BX
 	MOVQ 24(AX), SI
 	// 2*x
 	ADDQ DX, DX
 	ADCQ CX, CX
 	ADCQ BX, BX
 	ADCQ SI, SI

 	// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
 	REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)

 	// 4*x
 	ADDQ DX, DX
 	ADCQ CX, CX
 	ADCQ BX, BX
 	ADCQ SI, SI

 	// reduce element(DX,CX,BX,SI) using temp registers (R11,R12,R13,R14)
 	REDUCE(DX,CX,BX,SI,R11,R12,R13,R14)

 	// keep a copy of 4*x in (R11,R12,R13,R14); the scratch values REDUCE left
 	// there are no longer needed
 	MOVQ DX, R11
 	MOVQ CX, R12
 	MOVQ BX, R13
 	MOVQ SI, R14
 	// 8*x
 	ADDQ DX, DX
 	ADCQ CX, CX
 	ADCQ BX, BX
 	ADCQ SI, SI

 	// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
 	REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)

 	// 8*x + 4*x = 12*x
 	ADDQ R11, DX
 	ADCQ R12, CX
 	ADCQ R13, BX
 	ADCQ R14, SI

 	// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
 	REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)

 	// 12*x + x, reading the still-unmodified x from memory
 	ADDQ 0(AX), DX
 	ADCQ 8(AX), CX
 	ADCQ 16(AX), BX
 	ADCQ 24(AX), SI

 	// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
 	REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)

 	// write the result back over x
 	MOVQ DX, 0(AX)
 	MOVQ CX, 8(AX)
 	MOVQ BX, 16(AX)
 	MOVQ SI, 24(AX)
 	RET

 // Butterfly(a, b *Element) sets a = a + b; b = a - b
 //
 // Both results are computed from the original operands: a copy of a is taken
 // before the addition, so b receives (original a) - b, not (a + b) - b.
 // The difference is corrected by adding q back on borrow; the sum gets one
 // conditional subtraction of q.
 // Frame: no locals, 16 bytes of arguments (a at 0, b at 8).
 TEXT ·Butterfly(SB), NOSPLIT, $0-16
 	// load the four words of a into (CX,BX,SI,DI)
 	MOVQ    a+0(FP), AX
 	MOVQ    0(AX), CX
 	MOVQ    8(AX), BX
 	MOVQ    16(AX), SI
 	MOVQ    24(AX), DI
 	// second copy of a in (R8,R9,R10,R11) for the subtraction
 	MOVQ    CX, R8
 	MOVQ    BX, R9
 	MOVQ    SI, R10
 	MOVQ    DI, R11
 	// AX = 0 for the conditional moves below; this discards the pointer to a,
 	// which is reloaded before the final stores
 	XORQ    AX, AX
 	MOVQ    b+8(FP), DX
 	// (CX,BX,SI,DI) = a + b
 	ADDQ    0(DX), CX
 	ADCQ    8(DX), BX
 	ADCQ    16(DX), SI
 	ADCQ    24(DX), DI
 	// (R8,R9,R10,R11) = a - b; the borrow of this chain drives the CMOVQCC below
 	SUBQ    0(DX), R8
 	SBBQ    8(DX), R9
 	SBBQ    16(DX), R10
 	SBBQ    24(DX), R11
 	// (R12,R13,R14,R15) = q; MOVQ leaves the borrow flag intact
 	MOVQ    $0x43e1f593f0000001, R12
 	MOVQ    $0x2833e84879b97091, R13
 	MOVQ    $0xb85045b68181585d, R14
 	MOVQ    $0x30644e72e131a029, R15
 	// no borrow (carry clear): replace q by zero so the addition is a no-op
 	CMOVQCC AX, R12
 	CMOVQCC AX, R13
 	CMOVQCC AX, R14
 	CMOVQCC AX, R15
 	// add back q (or zero) and store the difference into b
 	ADDQ    R12, R8
 	ADCQ    R13, R9
 	ADCQ    R14, R10
 	ADCQ    R15, R11
 	MOVQ    R8, 0(DX)
 	MOVQ    R9, 8(DX)
 	MOVQ    R10, 16(DX)
 	MOVQ    R11, 24(DX)

 	// reduce element(CX,BX,SI,DI) using temp registers (R8,R9,R10,R11)
 	REDUCE(CX,BX,SI,DI,R8,R9,R10,R11)

 	// reload the pointer to a and store the sum
 	MOVQ a+0(FP), AX
 	MOVQ CX, 0(AX)
 	MOVQ BX, 8(AX)
 	MOVQ SI, 16(AX)
 	MOVQ DI, 24(AX)
 	RET
--- a/ff/element_ops_noasm.go
+++ b/ff/element_ops_noasm.go
@ -0,0 +1,78 @@
 //go:build !amd64
 // +build !amd64

 // Copyright 2020 ConsenSys Software Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Code generated by consensys/gnark-crypto DO NOT EDIT

 package ff

 // /!\ WARNING /!\
 // This code has not been audited and is provided as-is. In particular,
 // there are no security guarantees such as constant-time implementation
 // or side-channel attack resistance.
 // /!\ WARNING /!\

 // MulBy3 x *= 3
 //
 // Pure-Go variant for non-amd64 builds: x is updated in place by delegating
 // to mulByConstant with the constant 3.
 func MulBy3(x *Element) {
 	mulByConstant(x, 3)
 }

 // MulBy5 x *= 5
 //
 // Pure-Go variant for non-amd64 builds: x is updated in place by delegating
 // to mulByConstant with the constant 5.
 func MulBy5(x *Element) {
 	mulByConstant(x, 5)
 }

 // MulBy13 x *= 13
 //
 // Pure-Go variant for non-amd64 builds: x is updated in place by delegating
 // to mulByConstant with the constant 13.
 func MulBy13(x *Element) {
 	mulByConstant(x, 13)
 }

 // Butterfly sets
 // a = a + b
 // b = a - b
 //
 // Pure-Go variant for non-amd64 builds; the work is done by _butterflyGeneric.
 // NOTE(review): the amd64 assembly computes b from the original value of a
 // (before the addition) — presumably _butterflyGeneric does the same; confirm.
 func Butterfly(a, b *Element) {
 	_butterflyGeneric(a, b)
 }

 // mul is the non-amd64 multiplication primitive; it forwards its arguments
 // unchanged to _mulGeneric.
 // NOTE(review): presumably z = x*y in Montgomery form — confirm against
 // _mulGeneric.
 func mul(z, x, y *Element) {
 	_mulGeneric(z, x, y)
 }

 // fromMont converts z in place (i.e. mutates) from Montgomery to regular
 // representation (z = z * 1). It returns nothing; the work is delegated to
 // _fromMontGeneric on non-amd64 builds.
 func fromMont(z *Element) {
 	_fromMontGeneric(z)
 }

 // add is the non-amd64 addition primitive; it forwards its arguments
 // unchanged to _addGeneric.
 // NOTE(review): presumably z = x + y mod q, matching the amd64 assembly —
 // confirm against _addGeneric.
 func add(z, x, y *Element) {
 	_addGeneric(z, x, y)
 }

 // double is the non-amd64 doubling primitive; it forwards its arguments
 // unchanged to _doubleGeneric.
 // NOTE(review): presumably z = 2*x mod q, matching the amd64 assembly —
 // confirm against _doubleGeneric.
 func double(z, x *Element) {
 	_doubleGeneric(z, x)
 }

 // sub is the non-amd64 subtraction primitive; it forwards its arguments
 // unchanged to _subGeneric.
 // NOTE(review): presumably z = x - y mod q, matching the amd64 assembly —
 // confirm against _subGeneric.
 func sub(z, x, y *Element) {
 	_subGeneric(z, x, y)
 }

 // neg is the non-amd64 negation primitive; it forwards its arguments
 // unchanged to _negGeneric.
 // NOTE(review): presumably z = -x mod q (0 for x == 0), matching the amd64
 // assembly — confirm against _negGeneric.
 func neg(z, x *Element) {
 	_negGeneric(z, x)
 }

 // reduce is the non-amd64 reduction primitive; it forwards z to
 // _reduceGeneric, which updates it in place.
 // NOTE(review): presumably subtracts q once when z >= q, matching the amd64
 // assembly — confirm against _reduceGeneric.
 func reduce(z *Element) {
 	_reduceGeneric(z)
 }
--- a/ff/element_test.go
+++ b/ff/element_test.go
--- a/ff/util.go
+++ b/ff/util.go
@ -1,6 +0,0 @@
 package ff

 // NewElement allocates a zero-valued Element and returns a pointer to it.
 func NewElement() *Element {
 	return new(Element)
 }
--- a/go.mod
+++ b/go.mod
@ -6,4 +6,9 @@ require (
 	github.com/dchest/blake512 v1.0.0
 	github.com/stretchr/testify v1.7.0
 	golang.org/x/crypto v0.0.0-20211117183948-ae814b36b871
 	golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e // indirect
 	github.com/davecgh/go-spew v1.1.0 // indirect
 	github.com/leanovate/gopter v0.2.9 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect
 )
--- a/go.sum
+++ b/go.sum
@ -2,6 +2,8 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dchest/blake512 v1.0.0 h1:oDFEQFIqFSeuA34xLtXZ/rWxCXdSjirjzPhey5EUvmA=
 github.com/dchest/blake512 v1.0.0/go.mod h1:FV1x7xPPLWukZlpDpWQ88rF/SFwZ5qbskrzhLMB92JI=
 github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c=
 github.com/leanovate/gopter v0.2.9/go.mod h1:U2L/78B+KVFIx2VmW6onHJQzXtFb+p5y3y2Sh+Jxxv8=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@ -14,6 +16,8 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1 h1:SrN+KX8Art/Sf4HNj6Zcz06G7VEz+7w9tdXTPOZ7+l4=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e h1:fLOSk5Q00efkSvAm+4xcoXD+RRmLmmulPn5I3Y9F2EM=
 golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/poseidon/poseidon.go
+++ b/poseidon/poseidon.go
@ -20,7 +20,7 @@ func zero() *ff.Element {
 // exp5 performs x^5 mod p
 // https://eprint.iacr.org/2019/458.pdf page 8
 //
 // a is overwritten with the result. Exp is invoked exactly once, with the
 // exponent passed as a *big.Int: raising to the fifth power twice in a row
 // would yield a^25 instead of the a^5 documented above.
 func exp5(a *ff.Element) {
 	a.Exp(*a, big.NewInt(5)) //nolint:gomnd
 }

 // exp5state perform exp5 for whole state