Browse Source

Faster ff arithmetics (regenerated code with the newest goff) (#43)

fix/bbjj-err
Oleksandr Brezhniev 2 years ago
committed by GitHub
parent
commit
69354ae29c
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 4215 additions and 739 deletions
  1. +7
    -7
      babyjub/babyjub.go
  2. +2
    -64
      ff/arith.go
  3. +24
    -0
      ff/asm.go
  4. +25
    -0
      ff/asm_noadx.go
  5. +43
    -0
      ff/doc.go
  6. +777
    -532
      ff/element.go
  7. +136
    -0
      ff/element_fuzz.go
  8. +466
    -0
      ff/element_mul_adx_amd64.s
  9. +488
    -0
      ff/element_mul_amd64.s
  10. +50
    -0
      ff/element_ops_amd64.go
  11. +340
    -0
      ff/element_ops_amd64.s
  12. +78
    -0
      ff/element_ops_noasm.go
  13. +1769
    -129
      ff/element_test.go
  14. +0
    -6
      ff/util.go
  15. +5
    -0
      go.mod
  16. +4
    -0
      go.sum
  17. +1
    -1
      poseidon/poseidon.go

+ 7
- 7
babyjub/babyjub.go

@ -95,20 +95,20 @@ func (p *PointProjective) Add(q *PointProjective, o *PointProjective) *PointProj
c := ff.NewElement().Mul(q.X, o.X)
d := ff.NewElement().Mul(q.Y, o.Y)
e := ff.NewElement().Mul(Dff, c)
e.MulAssign(d)
e.Mul(e, d)
f := ff.NewElement().Sub(b, e)
g := ff.NewElement().Add(b, e)
x1y1 := ff.NewElement().Add(q.X, q.Y)
x2y2 := ff.NewElement().Add(o.X, o.Y)
x3 := ff.NewElement().Mul(x1y1, x2y2)
x3.SubAssign(c)
x3.SubAssign(d)
x3.MulAssign(a)
x3.MulAssign(f)
x3.Sub(x3, c)
x3.Sub(x3, d)
x3.Mul(x3, a)
x3.Mul(x3, f)
ac := ff.NewElement().Mul(Aff, c)
y3 := ff.NewElement().Sub(d, ac)
y3.MulAssign(a)
y3.MulAssign(g)
y3.Mul(y3, a)
y3.Mul(y3, g)
z3 := ff.NewElement().Mul(f, g)
p.X = x3

+ 2
- 64
ff/arith.go

@ -1,4 +1,4 @@
// Copyright 2020 ConsenSys AG
// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by goff DO NOT EDIT
// Code generated by consensys/gnark-crypto DO NOT EDIT
package ff
@ -20,15 +20,6 @@ import (
"math/bits"
)
// madd adds the 128-bit product a*b to the three-word value (t, u, v), where
// v is the least significant word, and returns the updated (t, u, v).
// A carry out of the most significant word t is discarded.
func madd(a, b, t, u, v uint64) (uint64, uint64, uint64) {
	prodHi, prodLo := bits.Mul64(a, b)

	// Ripple the product into the accumulator, lowest word first.
	low, c0 := bits.Add64(prodLo, v, 0)
	mid, c1 := bits.Add64(prodHi, u, c0)
	top, _ := bits.Add64(t, 0, c1)

	return top, mid, low
}
// madd0 hi = a*b + c (discards lo bits)
func madd0(a, b, c uint64) (hi uint64) {
var carry, lo uint64
@ -58,59 +49,6 @@ func madd2(a, b, c, d uint64) (hi uint64, lo uint64) {
return
}
// madd2s computes 2*a*b + c + e and then adds d into the hi word (that is, d
// is weighted by 2^64), returning the result as (superhi, hi, lo).
//
// Only the carry produced while doubling the product is recorded in superhi;
// carries out of hi caused by the later additions are discarded.
func madd2s(a, b, c, d, e uint64) (superhi, hi, lo uint64) {
	var carry, sum uint64

	// 2*a*b: double the 128-bit product, capturing the overflow in superhi.
	hi, lo = bits.Mul64(a, b)
	lo, carry = bits.Add64(lo, lo, 0)
	hi, superhi = bits.Add64(hi, hi, carry)

	// + c + e: each addition may carry into hi.
	sum, carry = bits.Add64(c, e, 0)
	hi, _ = bits.Add64(hi, 0, carry)
	lo, carry = bits.Add64(lo, sum, 0)
	hi, _ = bits.Add64(hi, 0, carry)

	// d is a full word, not a carry bit. bits.Add64 is only defined for a
	// carry-in of 0 or 1, so d must be passed as an operand.
	hi, _ = bits.Add64(hi, d, 0)
	return superhi, hi, lo
}
// madd1s computes 2*a*b + e and then adds d into the hi word (that is, d is
// weighted by 2^64), returning the result as (superhi, hi, lo).
//
// Only the carry produced while doubling the product is recorded in superhi;
// carries out of hi caused by the later additions are discarded.
func madd1s(a, b, d, e uint64) (superhi, hi, lo uint64) {
	var carry uint64

	// 2*a*b: double the 128-bit product, capturing the overflow in superhi.
	hi, lo = bits.Mul64(a, b)
	lo, carry = bits.Add64(lo, lo, 0)
	hi, superhi = bits.Add64(hi, hi, carry)

	// + e, with its carry rippling into hi.
	lo, carry = bits.Add64(lo, e, 0)
	hi, _ = bits.Add64(hi, 0, carry)

	// d is a full word, not a carry bit. bits.Add64 is only defined for a
	// carry-in of 0 or 1, so d must be passed as an operand.
	hi, _ = bits.Add64(hi, d, 0)
	return superhi, hi, lo
}
// madd2sb computes 2*a*b + c + e and returns it as (superhi, hi, lo).
// superhi only records the carry produced while doubling the product;
// carries out of hi from adding c and e are discarded.
func madd2sb(a, b, c, e uint64) (superhi, hi, lo uint64) {
	prodHi, prodLo := bits.Mul64(a, b)

	// Double the product.
	dblLo, cLo := bits.Add64(prodLo, prodLo, 0)
	dblHi, top := bits.Add64(prodHi, prodHi, cLo)

	// Fold c + e into the low word, pushing both possible carries into hi.
	addend, cSum := bits.Add64(c, e, 0)
	dblHi, _ = bits.Add64(dblHi, 0, cSum)
	dblLo, cLo = bits.Add64(dblLo, addend, 0)
	dblHi, _ = bits.Add64(dblHi, 0, cLo)

	return top, dblHi, dblLo
}
// madd1sb computes 2*a*b + e and returns it as (superhi, hi, lo).
// superhi only records the carry produced while doubling the product;
// a carry out of hi from adding e is discarded.
func madd1sb(a, b, e uint64) (superhi, hi, lo uint64) {
	prodHi, prodLo := bits.Mul64(a, b)

	// Double the product.
	dblLo, cLo := bits.Add64(prodLo, prodLo, 0)
	dblHi, top := bits.Add64(prodHi, prodHi, cLo)

	// Add e to the low word and ripple its carry into hi.
	dblLo, cLo = bits.Add64(dblLo, e, 0)
	dblHi, _ = bits.Add64(dblHi, 0, cLo)

	return top, dblHi, dblLo
}
func madd3(a, b, c, d, e uint64) (hi uint64, lo uint64) {
var carry uint64
hi, lo = bits.Mul64(a, b)

+ 24
- 0
ff/asm.go

@ -0,0 +1,24 @@
//go:build !noadx
// +build !noadx
// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by consensys/gnark-crypto DO NOT EDIT
package ff
import "golang.org/x/sys/cpu"
var supportAdx = cpu.X86.HasADX && cpu.X86.HasBMI2

+ 25
- 0
ff/asm_noadx.go

@ -0,0 +1,25 @@
//go:build noadx
// +build noadx
// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by consensys/gnark-crypto DO NOT EDIT
package ff
// Note: this file is needed for test purposes, because dynamically changing
// supportAdx does not surface certain errors (such as "fatal error: missing stackmap").
// Building with the noadx tag ensures that all asm paths are tested.
var supportAdx = false

+ 43
- 0
ff/doc.go

@ -0,0 +1,43 @@
// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by consensys/gnark-crypto DO NOT EDIT
// Package ff contains field arithmetic operations for modulus = 0x30644e...000001.
//
// The API is similar to math/big (big.Int), but the operations are significantly faster (up to 20x for the modular multiplication on amd64, see also https://hackmd.io/@zkteam/modular_multiplication)
//
// The modulus is hardcoded in all the operations.
//
// Field elements are represented as an array, and assumed to be in Montgomery form in all methods:
// type Element [4]uint64
//
// Example API signature
// // Mul z = x * y mod q
// func (z *Element) Mul(x, y *Element) *Element
//
// and can be used like so:
// var a, b Element
// a.SetUint64(2)
// b.SetString("984896738")
// a.Mul(a, b)
// a.Sub(a, a)
// .Add(a, b)
// .Inverse(a)
// b.Exp(b, new(big.Int).SetUint64(42))
//
// Modulus
// 0x30644e72e131a029b85045b68181585d2833e84879b9709143e1f593f0000001 // base 16
// 21888242871839275222246405745257275088548364400416034343698204186575808495617 // base 10
package ff

+ 777
- 532
ff/element.go
File diff suppressed because it is too large
View File


+ 136
- 0
ff/element_fuzz.go

@ -0,0 +1,136 @@
//go:build gofuzz
// +build gofuzz
// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by consensys/gnark-crypto DO NOT EDIT
package ff
import (
"bytes"
"encoding/binary"
"io"
"math/big"
"math/bits"
)
const (
fuzzInteresting = 1
fuzzNormal = 0
fuzzDiscard = -1
)
// Fuzz is the fuzzing entry point for the arithmetic operations.
// It derives two elements from data via SetRawBytes and panics as soon as one
// of the checked properties does not hold. It always returns fuzzNormal.
func Fuzz(data []byte) int {
	var e1, e2 Element
	src := bytes.NewReader(data)
	e1.SetRawBytes(src)
	e2.SetRawBytes(src)

	// Property 1: Element.Mul must agree with _mulGeneric on the same operands.
	{
		x, y := e1, e2
		xRef, yRef := e1, e2
		var got, want Element
		got.Mul(&x, &y)
		_mulGeneric(&want, &xRef, &yRef)
		if !got.Equal(&want) {
			panic("mul asm != mul generic on Element")
		}
	}

	// Property 2: Element.Inverse must agree with big.Int.ModInverse.
	{
		var expected, regular, actual big.Int
		inverted := e1
		inverted.Inverse(&inverted)
		e1.ToBigIntRegular(&regular)
		expected.ModInverse(&regular, Modulus())
		inverted.ToBigIntRegular(&actual)
		if actual.Cmp(&expected) != 0 {
			panic("inverse operation doesn't match big int result")
		}
	}

	// Property 3: a + (-a) must be zero.
	{
		sum, negated := e1, e1
		negated.Neg(&negated)
		sum.Add(&sum, &negated)
		if !sum.IsZero() {
			panic("a + -a != 0")
		}
	}

	return fuzzNormal
}
// SetRawBytes fills z limb by limb from r: each limb is read as 8 bytes and
// decoded as a big endian uint64. Reading stops at the first error returned
// by io.ReadFull (including a short read), leaving the remaining limbs as
// they were. Afterwards the top limb is reduced modulo qElement[3] and, if
// BiggerModulus reports true, qElement is subtracted once.
// Used for fuzzing purposes only.
func (z *Element) SetRawBytes(r io.Reader) {
	word := make([]byte, 8)
	for i := 0; i < len(z); i++ {
		_, err := io.ReadFull(r, word)
		if err != nil {
			break
		}
		z[i] = binary.BigEndian.Uint64(word)
	}

	z[3] %= qElement[3]

	// NOTE(review): after the reduction above z[3] < qElement[3], so given
	// BiggerModulus's limb-wise comparison this branch looks unreachable;
	// it is kept to match the generated code.
	if !z.BiggerModulus() {
		return
	}
	var borrow uint64
	z[0], borrow = bits.Sub64(z[0], qElement[0], 0)
	z[1], borrow = bits.Sub64(z[1], qElement[1], borrow)
	z[2], borrow = bits.Sub64(z[2], qElement[2], borrow)
	z[3], _ = bits.Sub64(z[3], qElement[3], borrow)
}
// BiggerModulus reports whether z, compared limb by limb from the most
// significant limb (index 3) down to the least significant (index 0), is
// greater than or equal to qElement. Note that equality also yields true.
func (z *Element) BiggerModulus() bool {
	// The first differing upper limb decides the comparison.
	for i := 3; i > 0; i-- {
		if z[i] != qElement[i] {
			return z[i] > qElement[i]
		}
	}
	// All upper limbs are equal: the lowest limb decides, ties count as true.
	return z[0] >= qElement[0]
}

+ 466
- 0
ff/element_mul_adx_amd64.s

@ -0,0 +1,466 @@
// +build amd64_adx
// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "textflag.h"
#include "funcdata.h"
// modulus q
DATA q<>+0(SB)/8, $0x43e1f593f0000001
DATA q<>+8(SB)/8, $0x2833e84879b97091
DATA q<>+16(SB)/8, $0xb85045b68181585d
DATA q<>+24(SB)/8, $0x30644e72e131a029
GLOBL q<>(SB), (RODATA+NOPTR), $32
// qInv0 q'[0]
DATA qInv0<>(SB)/8, $0xc2e1f593efffffff
GLOBL qInv0<>(SB), (RODATA+NOPTR), $8
// REDUCE conditionally subtracts the modulus q from the 4-limb value held in
// (ra0, ra1, ra2, ra3), least significant limb first.
// Each limb is first copied to the matching rb register, then q is subtracted
// with borrow propagation (SUBQ/SBBQ). If the final subtraction borrows
// (carry flag set), the CMOVQCS instructions restore the saved copy, so the
// value is left unchanged; otherwise the difference is kept.
// rb0..rb3 are clobbered.
#define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \
MOVQ ra0, rb0; \
SUBQ q<>(SB), ra0; \
MOVQ ra1, rb1; \
SBBQ q<>+8(SB), ra1; \
MOVQ ra2, rb2; \
SBBQ q<>+16(SB), ra2; \
MOVQ ra3, rb3; \
SBBQ q<>+24(SB), ra3; \
CMOVQCS rb0, ra0; \
CMOVQCS rb1, ra1; \
CMOVQCS rb2, ra2; \
CMOVQCS rb3, ra3; \
// mul(res, x, y *Element)
// Multiplies the 4-limb operands at x and y following the algorithm sketched
// below and stores the 4-limb result, after a final REDUCE, at res.
// This variant uses MULXQ/ADCXQ/ADOXQ unconditionally (no supportAdx check).
// The outer loop is fully unrolled: one block per limb y[0]..y[3].
// NOTE(review): BP is used as a scratch register (A) in this $0-frame NOSPLIT
// function and is not restored before RET; confirm this is acceptable for the
// Go toolchain in use.
TEXT ·mul(SB), NOSPLIT, $0-24
// the algorithm is described here
// https://hackmd.io/@zkteam/modular_multiplication
// however, to benefit from the ADCX and ADOX carry chains
// we split the inner loops in 2:
// for i=0 to N-1
// for j=0 to N-1
// (A,t[j]) := t[j] + x[j]*y[i] + A
// m := t[0]*q'[0] mod W
// C,_ := t[0] + m*q[0]
// for j=1 to N-1
// (C,t[j-1]) := t[j] + m*q[j] + C
// t[N-1] = C + A
MOVQ x+8(FP), SI
// x[0] -> DI
// x[1] -> R8
// x[2] -> R9
// x[3] -> R10
MOVQ 0(SI), DI
MOVQ 8(SI), R8
MOVQ 16(SI), R9
MOVQ 24(SI), R10
MOVQ y+16(FP), R11
// A -> BP
// t[0] -> R14
// t[1] -> R15
// t[2] -> CX
// t[3] -> BX
// ---- i = 0: multiply by y[0] ----
// clear the flags
XORQ AX, AX
MOVQ 0(R11), DX
// (A,t[0]) := x[0]*y[0] + A
MULXQ DI, R14, R15
// (A,t[1]) := x[1]*y[0] + A
MULXQ R8, AX, CX
ADOXQ AX, R15
// (A,t[2]) := x[2]*y[0] + A
MULXQ R9, AX, BX
ADOXQ AX, CX
// (A,t[3]) := x[3]*y[0] + A
MULXQ R10, AX, BP
ADOXQ AX, BX
// A += carries from ADCXQ and ADOXQ
// (in this first round only the ADOXQ chain has been used, hence a single ADOXQ)
MOVQ $0, AX
ADOXQ AX, BP
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
// clear the flags
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, R12
ADCXQ R14, AX
MOVQ R12, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
// t[3] = C + A
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ BP, BX
// ---- i = 1: multiply by y[1] ----
// clear the flags
XORQ AX, AX
MOVQ 8(R11), DX
// (A,t[0]) := t[0] + x[0]*y[1] + A
MULXQ DI, AX, BP
ADOXQ AX, R14
// (A,t[1]) := t[1] + x[1]*y[1] + A
ADCXQ BP, R15
MULXQ R8, AX, BP
ADOXQ AX, R15
// (A,t[2]) := t[2] + x[2]*y[1] + A
ADCXQ BP, CX
MULXQ R9, AX, BP
ADOXQ AX, CX
// (A,t[3]) := t[3] + x[3]*y[1] + A
ADCXQ BP, BX
MULXQ R10, AX, BP
ADOXQ AX, BX
// A += carries from ADCXQ and ADOXQ
MOVQ $0, AX
ADCXQ AX, BP
ADOXQ AX, BP
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
// clear the flags
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, R12
ADCXQ R14, AX
MOVQ R12, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
// t[3] = C + A
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ BP, BX
// ---- i = 2: multiply by y[2] ----
// clear the flags
XORQ AX, AX
MOVQ 16(R11), DX
// (A,t[0]) := t[0] + x[0]*y[2] + A
MULXQ DI, AX, BP
ADOXQ AX, R14
// (A,t[1]) := t[1] + x[1]*y[2] + A
ADCXQ BP, R15
MULXQ R8, AX, BP
ADOXQ AX, R15
// (A,t[2]) := t[2] + x[2]*y[2] + A
ADCXQ BP, CX
MULXQ R9, AX, BP
ADOXQ AX, CX
// (A,t[3]) := t[3] + x[3]*y[2] + A
ADCXQ BP, BX
MULXQ R10, AX, BP
ADOXQ AX, BX
// A += carries from ADCXQ and ADOXQ
MOVQ $0, AX
ADCXQ AX, BP
ADOXQ AX, BP
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
// clear the flags
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, R12
ADCXQ R14, AX
MOVQ R12, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
// t[3] = C + A
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ BP, BX
// ---- i = 3: multiply by y[3] ----
// clear the flags
XORQ AX, AX
MOVQ 24(R11), DX
// (A,t[0]) := t[0] + x[0]*y[3] + A
MULXQ DI, AX, BP
ADOXQ AX, R14
// (A,t[1]) := t[1] + x[1]*y[3] + A
ADCXQ BP, R15
MULXQ R8, AX, BP
ADOXQ AX, R15
// (A,t[2]) := t[2] + x[2]*y[3] + A
ADCXQ BP, CX
MULXQ R9, AX, BP
ADOXQ AX, CX
// (A,t[3]) := t[3] + x[3]*y[3] + A
ADCXQ BP, BX
MULXQ R10, AX, BP
ADOXQ AX, BX
// A += carries from ADCXQ and ADOXQ
MOVQ $0, AX
ADCXQ AX, BP
ADOXQ AX, BP
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
// clear the flags
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, R12
ADCXQ R14, AX
MOVQ R12, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
// t[3] = C + A
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ BP, BX
// reduce element(R14,R15,CX,BX) using temp registers (R13,SI,R12,R11)
REDUCE(R14,R15,CX,BX,R13,SI,R12,R11)
// store t[0..3] into res
MOVQ res+0(FP), AX
MOVQ R14, 0(AX)
MOVQ R15, 8(AX)
MOVQ CX, 16(AX)
MOVQ BX, 24(AX)
RET
// fromMont(res *Element)
// Runs the reduction rounds of the multiplication algorithm with y = 1 on the
// 4-limb value at res, in place (see the sketch below). The four rounds are
// fully unrolled and followed by a final REDUCE.
// This variant uses MULXQ/ADCXQ/ADOXQ unconditionally (no supportAdx check).
// NOTE(review): BP is used as a scratch register in this $0-frame NOSPLIT
// function and is not restored before RET; confirm this is acceptable for the
// Go toolchain in use.
TEXT ·fromMont(SB), NOSPLIT, $0-8
// the algorithm is described here
// https://hackmd.io/@zkteam/modular_multiplication
// when y = 1 we have:
// for i=0 to N-1
// t[i] = x[i]
// for i=0 to N-1
// m := t[0]*q'[0] mod W
// C,_ := t[0] + m*q[0]
// for j=1 to N-1
// (C,t[j-1]) := t[j] + m*q[j] + C
// t[N-1] = C
// load t[0..3] from res into R14, R15, CX, BX
MOVQ res+0(FP), DX
MOVQ 0(DX), R14
MOVQ 8(DX), R15
MOVQ 16(DX), CX
MOVQ 24(DX), BX
// ---- round 1 ----
XORQ DX, DX
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, BP
ADCXQ R14, AX
MOVQ BP, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
// t[3] = C: fold the remaining carries of both chains into BX
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ AX, BX
// ---- round 2 ----
XORQ DX, DX
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, BP
ADCXQ R14, AX
MOVQ BP, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ AX, BX
// ---- round 3 ----
XORQ DX, DX
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, BP
ADCXQ R14, AX
MOVQ BP, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ AX, BX
// ---- round 4 ----
XORQ DX, DX
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, BP
ADCXQ R14, AX
MOVQ BP, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ AX, BX
// reduce element(R14,R15,CX,BX) using temp registers (SI,DI,R8,R9)
REDUCE(R14,R15,CX,BX,SI,DI,R8,R9)
// store t[0..3] back into res
MOVQ res+0(FP), AX
MOVQ R14, 0(AX)
MOVQ R15, 8(AX)
MOVQ CX, 16(AX)
MOVQ BX, 24(AX)
RET

+ 488
- 0
ff/element_mul_amd64.s

@ -0,0 +1,488 @@
// +build !amd64_adx
// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "textflag.h"
#include "funcdata.h"
// modulus q
DATA q<>+0(SB)/8, $0x43e1f593f0000001
DATA q<>+8(SB)/8, $0x2833e84879b97091
DATA q<>+16(SB)/8, $0xb85045b68181585d
DATA q<>+24(SB)/8, $0x30644e72e131a029
GLOBL q<>(SB), (RODATA+NOPTR), $32
// qInv0 q'[0]
DATA qInv0<>(SB)/8, $0xc2e1f593efffffff
GLOBL qInv0<>(SB), (RODATA+NOPTR), $8
// REDUCE conditionally subtracts the modulus q from the 4-limb value held in
// (ra0, ra1, ra2, ra3), least significant limb first.
// Each limb is first copied to the matching rb register, then q is subtracted
// with borrow propagation (SUBQ/SBBQ). If the final subtraction borrows
// (carry flag set), the CMOVQCS instructions restore the saved copy, so the
// value is left unchanged; otherwise the difference is kept.
// rb0..rb3 are clobbered.
#define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \
MOVQ ra0, rb0; \
SUBQ q<>(SB), ra0; \
MOVQ ra1, rb1; \
SBBQ q<>+8(SB), ra1; \
MOVQ ra2, rb2; \
SBBQ q<>+16(SB), ra2; \
MOVQ ra3, rb3; \
SBBQ q<>+24(SB), ra3; \
CMOVQCS rb0, ra0; \
CMOVQCS rb1, ra1; \
CMOVQCS rb2, ra2; \
CMOVQCS rb3, ra3; \
// mul(res, x, y *Element)
// Multiplies the 4-limb operands at x and y following the algorithm sketched
// below and stores the 4-limb result, after a final REDUCE, at res.
// The MULXQ/ADCXQ/ADOXQ path is taken only when supportAdx is 1; otherwise
// control jumps to l1, which forwards the three arguments to _mulGeneric.
// The 24-byte local frame holds the outgoing arguments of that call.
TEXT ·mul(SB), $24-24
// the algorithm is described here
// https://hackmd.io/@zkteam/modular_multiplication
// however, to benefit from the ADCX and ADOX carry chains
// we split the inner loops in 2:
// for i=0 to N-1
// for j=0 to N-1
// (A,t[j]) := t[j] + x[j]*y[i] + A
// m := t[0]*q'[0] mod W
// C,_ := t[0] + m*q[0]
// for j=1 to N-1
// (C,t[j-1]) := t[j] + m*q[j] + C
// t[N-1] = C + A
NO_LOCAL_POINTERS
// runtime dispatch: use the generic implementation unless supportAdx == 1
CMPB ·supportAdx(SB), $1
JNE l1
MOVQ x+8(FP), SI
// x[0] -> DI
// x[1] -> R8
// x[2] -> R9
// x[3] -> R10
MOVQ 0(SI), DI
MOVQ 8(SI), R8
MOVQ 16(SI), R9
MOVQ 24(SI), R10
MOVQ y+16(FP), R11
// A -> BP
// t[0] -> R14
// t[1] -> R15
// t[2] -> CX
// t[3] -> BX
// ---- i = 0: multiply by y[0] ----
// clear the flags
XORQ AX, AX
MOVQ 0(R11), DX
// (A,t[0]) := x[0]*y[0] + A
MULXQ DI, R14, R15
// (A,t[1]) := x[1]*y[0] + A
MULXQ R8, AX, CX
ADOXQ AX, R15
// (A,t[2]) := x[2]*y[0] + A
MULXQ R9, AX, BX
ADOXQ AX, CX
// (A,t[3]) := x[3]*y[0] + A
MULXQ R10, AX, BP
ADOXQ AX, BX
// A += carries from ADCXQ and ADOXQ
// (in this first round only the ADOXQ chain has been used, hence a single ADOXQ)
MOVQ $0, AX
ADOXQ AX, BP
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
// clear the flags
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, R12
ADCXQ R14, AX
MOVQ R12, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
// t[3] = C + A
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ BP, BX
// ---- i = 1: multiply by y[1] ----
// clear the flags
XORQ AX, AX
MOVQ 8(R11), DX
// (A,t[0]) := t[0] + x[0]*y[1] + A
MULXQ DI, AX, BP
ADOXQ AX, R14
// (A,t[1]) := t[1] + x[1]*y[1] + A
ADCXQ BP, R15
MULXQ R8, AX, BP
ADOXQ AX, R15
// (A,t[2]) := t[2] + x[2]*y[1] + A
ADCXQ BP, CX
MULXQ R9, AX, BP
ADOXQ AX, CX
// (A,t[3]) := t[3] + x[3]*y[1] + A
ADCXQ BP, BX
MULXQ R10, AX, BP
ADOXQ AX, BX
// A += carries from ADCXQ and ADOXQ
MOVQ $0, AX
ADCXQ AX, BP
ADOXQ AX, BP
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
// clear the flags
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, R12
ADCXQ R14, AX
MOVQ R12, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
// t[3] = C + A
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ BP, BX
// ---- i = 2: multiply by y[2] ----
// clear the flags
XORQ AX, AX
MOVQ 16(R11), DX
// (A,t[0]) := t[0] + x[0]*y[2] + A
MULXQ DI, AX, BP
ADOXQ AX, R14
// (A,t[1]) := t[1] + x[1]*y[2] + A
ADCXQ BP, R15
MULXQ R8, AX, BP
ADOXQ AX, R15
// (A,t[2]) := t[2] + x[2]*y[2] + A
ADCXQ BP, CX
MULXQ R9, AX, BP
ADOXQ AX, CX
// (A,t[3]) := t[3] + x[3]*y[2] + A
ADCXQ BP, BX
MULXQ R10, AX, BP
ADOXQ AX, BX
// A += carries from ADCXQ and ADOXQ
MOVQ $0, AX
ADCXQ AX, BP
ADOXQ AX, BP
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
// clear the flags
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, R12
ADCXQ R14, AX
MOVQ R12, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
// t[3] = C + A
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ BP, BX
// ---- i = 3: multiply by y[3] ----
// clear the flags
XORQ AX, AX
MOVQ 24(R11), DX
// (A,t[0]) := t[0] + x[0]*y[3] + A
MULXQ DI, AX, BP
ADOXQ AX, R14
// (A,t[1]) := t[1] + x[1]*y[3] + A
ADCXQ BP, R15
MULXQ R8, AX, BP
ADOXQ AX, R15
// (A,t[2]) := t[2] + x[2]*y[3] + A
ADCXQ BP, CX
MULXQ R9, AX, BP
ADOXQ AX, CX
// (A,t[3]) := t[3] + x[3]*y[3] + A
ADCXQ BP, BX
MULXQ R10, AX, BP
ADOXQ AX, BX
// A += carries from ADCXQ and ADOXQ
MOVQ $0, AX
ADCXQ AX, BP
ADOXQ AX, BP
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
// clear the flags
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, R12
ADCXQ R14, AX
MOVQ R12, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
// t[3] = C + A
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ BP, BX
// reduce element(R14,R15,CX,BX) using temp registers (R13,SI,R12,R11)
REDUCE(R14,R15,CX,BX,R13,SI,R12,R11)
// store t[0..3] into res
MOVQ res+0(FP), AX
MOVQ R14, 0(AX)
MOVQ R15, 8(AX)
MOVQ CX, 16(AX)
MOVQ BX, 24(AX)
RET
// fallback: copy (res, x, y) into the outgoing argument slots and call _mulGeneric
l1:
MOVQ res+0(FP), AX
MOVQ AX, (SP)
MOVQ x+8(FP), AX
MOVQ AX, 8(SP)
MOVQ y+16(FP), AX
MOVQ AX, 16(SP)
CALL ·_mulGeneric(SB)
RET
// fromMont(res *Element)
// Runs the reduction rounds of the multiplication algorithm with y = 1 on the
// 4-limb value at res, in place (see the sketch below). The four rounds are
// fully unrolled and followed by a final REDUCE.
// The MULXQ/ADCXQ/ADOXQ path is taken only when supportAdx is 1; otherwise
// control jumps to l2, which forwards res to _fromMontGeneric.
// The 8-byte local frame holds the outgoing argument of that call.
TEXT ·fromMont(SB), $8-8
NO_LOCAL_POINTERS
// the algorithm is described here
// https://hackmd.io/@zkteam/modular_multiplication
// when y = 1 we have:
// for i=0 to N-1
// t[i] = x[i]
// for i=0 to N-1
// m := t[0]*q'[0] mod W
// C,_ := t[0] + m*q[0]
// for j=1 to N-1
// (C,t[j-1]) := t[j] + m*q[j] + C
// t[N-1] = C
// runtime dispatch: use the generic implementation unless supportAdx == 1
CMPB ·supportAdx(SB), $1
JNE l2
// load t[0..3] from res into R14, R15, CX, BX
MOVQ res+0(FP), DX
MOVQ 0(DX), R14
MOVQ 8(DX), R15
MOVQ 16(DX), CX
MOVQ 24(DX), BX
// ---- round 1 ----
XORQ DX, DX
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, BP
ADCXQ R14, AX
MOVQ BP, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
// t[3] = C: fold the remaining carries of both chains into BX
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ AX, BX
// ---- round 2 ----
XORQ DX, DX
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, BP
ADCXQ R14, AX
MOVQ BP, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ AX, BX
// ---- round 3 ----
XORQ DX, DX
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, BP
ADCXQ R14, AX
MOVQ BP, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ AX, BX
// ---- round 4 ----
XORQ DX, DX
// m := t[0]*q'[0] mod W
MOVQ qInv0<>(SB), DX
IMULQ R14, DX
XORQ AX, AX
// C,_ := t[0] + m*q[0]
MULXQ q<>+0(SB), AX, BP
ADCXQ R14, AX
MOVQ BP, R14
// (C,t[0]) := t[1] + m*q[1] + C
ADCXQ R15, R14
MULXQ q<>+8(SB), AX, R15
ADOXQ AX, R14
// (C,t[1]) := t[2] + m*q[2] + C
ADCXQ CX, R15
MULXQ q<>+16(SB), AX, CX
ADOXQ AX, R15
// (C,t[2]) := t[3] + m*q[3] + C
ADCXQ BX, CX
MULXQ q<>+24(SB), AX, BX
ADOXQ AX, CX
MOVQ $0, AX
ADCXQ AX, BX
ADOXQ AX, BX
// reduce element(R14,R15,CX,BX) using temp registers (SI,DI,R8,R9)
REDUCE(R14,R15,CX,BX,SI,DI,R8,R9)
// store t[0..3] back into res
MOVQ res+0(FP), AX
MOVQ R14, 0(AX)
MOVQ R15, 8(AX)
MOVQ CX, 16(AX)
MOVQ BX, 24(AX)
RET
// fallback: pass res to _fromMontGeneric
l2:
MOVQ res+0(FP), AX
MOVQ AX, (SP)
CALL ·_fromMontGeneric(SB)
RET

+ 50
- 0
ff/element_ops_amd64.go

@ -0,0 +1,50 @@
// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by consensys/gnark-crypto DO NOT EDIT
package ff
// MulBy3 multiplies x by 3 in place (implemented in assembly: double, add x,
// with a conditional subtraction of the modulus after each step).
//go:noescape
func MulBy3(x *Element)
// MulBy5 multiplies x by 5 in place (implemented in assembly: double twice,
// add x, with a conditional subtraction of the modulus after each step).
//go:noescape
func MulBy5(x *Element)
// MulBy13 multiplies x by 13 in place (implemented in assembly as
// 8x + 4x + x, with a conditional subtraction of the modulus after each step).
//go:noescape
func MulBy13(x *Element)
// add sets res = x + y, followed by a conditional subtraction of the modulus
// (implemented in assembly).
//go:noescape
func add(res, x, y *Element)
// sub sets res = x - y, adding the modulus back when the subtraction borrows
// (implemented in assembly).
//go:noescape
func sub(res, x, y *Element)
// neg sets res = modulus - x, or res = 0 when x is zero (implemented in assembly).
//go:noescape
func neg(res, x *Element)
// double sets res = 2*x, followed by a conditional subtraction of the modulus
// (implemented in assembly).
//go:noescape
func double(res, x *Element)
// mul sets res to the product of x and y as computed by the assembly
// multiplication routine.
//go:noescape
func mul(res, x, y *Element)
// fromMont runs the assembly multiplication algorithm with y = 1 on res, in place.
//go:noescape
func fromMont(res *Element)
// reduce conditionally subtracts the modulus from res, in place
// (implemented in assembly).
//go:noescape
func reduce(res *Element)
// Butterfly sets a = a + b; b = a - b (implemented in assembly).
//go:noescape
func Butterfly(a, b *Element)

+ 340
- 0
ff/element_ops_amd64.s

@ -0,0 +1,340 @@
// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "textflag.h"
#include "funcdata.h"
// modulus q
DATA q<>+0(SB)/8, $0x43e1f593f0000001
DATA q<>+8(SB)/8, $0x2833e84879b97091
DATA q<>+16(SB)/8, $0xb85045b68181585d
DATA q<>+24(SB)/8, $0x30644e72e131a029
GLOBL q<>(SB), (RODATA+NOPTR), $32
// qInv0 q'[0]
DATA qInv0<>(SB)/8, $0xc2e1f593efffffff
GLOBL qInv0<>(SB), (RODATA+NOPTR), $8
// REDUCE conditionally subtracts the modulus q from the 4-limb value held in
// (ra0, ra1, ra2, ra3), least significant limb first.
// Each limb is first copied to the matching rb register, then q is subtracted
// with borrow propagation (SUBQ/SBBQ). If the final subtraction borrows
// (carry flag set), the CMOVQCS instructions restore the saved copy, so the
// value is left unchanged; otherwise the difference is kept.
// rb0..rb3 are clobbered.
#define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \
MOVQ ra0, rb0; \
SUBQ q<>(SB), ra0; \
MOVQ ra1, rb1; \
SBBQ q<>+8(SB), ra1; \
MOVQ ra2, rb2; \
SBBQ q<>+16(SB), ra2; \
MOVQ ra3, rb3; \
SBBQ q<>+24(SB), ra3; \
CMOVQCS rb0, ra0; \
CMOVQCS rb1, ra1; \
CMOVQCS rb2, ra2; \
CMOVQCS rb3, ra3; \
// add(res, x, y *Element)
// Sets res = x + y limb by limb with carry propagation, then conditionally
// subtracts the modulus via REDUCE.
TEXT ·add(SB), NOSPLIT, $0-24
// load x into (CX, BX, SI, DI)
MOVQ x+8(FP), AX
MOVQ 0(AX), CX
MOVQ 8(AX), BX
MOVQ 16(AX), SI
MOVQ 24(AX), DI
// add y with carry propagation
MOVQ y+16(FP), DX
ADDQ 0(DX), CX
ADCQ 8(DX), BX
ADCQ 16(DX), SI
ADCQ 24(DX), DI
// reduce element(CX,BX,SI,DI) using temp registers (R8,R9,R10,R11)
REDUCE(CX,BX,SI,DI,R8,R9,R10,R11)
// store the result into res
MOVQ res+0(FP), R12
MOVQ CX, 0(R12)
MOVQ BX, 8(R12)
MOVQ SI, 16(R12)
MOVQ DI, 24(R12)
RET
// sub(res, x, y *Element)
// Sets res = x - y limb by limb with borrow propagation. If the subtraction
// borrows, the modulus is added back; otherwise zero is added.
TEXT ·sub(SB), NOSPLIT, $0-24
// DI = 0, used below as the "no correction" value
XORQ DI, DI
// load x into (AX, DX, CX, BX)
MOVQ x+8(FP), SI
MOVQ 0(SI), AX
MOVQ 8(SI), DX
MOVQ 16(SI), CX
MOVQ 24(SI), BX
// subtract y with borrow propagation
MOVQ y+16(FP), SI
SUBQ 0(SI), AX
SBBQ 8(SI), DX
SBBQ 16(SI), CX
SBBQ 24(SI), BX
// load the modulus limbs into (R8, R9, R10, R11)
MOVQ $0x43e1f593f0000001, R8
MOVQ $0x2833e84879b97091, R9
MOVQ $0xb85045b68181585d, R10
MOVQ $0x30644e72e131a029, R11
// no borrow (carry clear): replace the correction by zero
CMOVQCC DI, R8
CMOVQCC DI, R9
CMOVQCC DI, R10
CMOVQCC DI, R11
// add the correction (modulus or zero)
ADDQ R8, AX
ADCQ R9, DX
ADCQ R10, CX
ADCQ R11, BX
// store the result into res
MOVQ res+0(FP), R12
MOVQ AX, 0(R12)
MOVQ DX, 8(R12)
MOVQ CX, 16(R12)
MOVQ BX, 24(R12)
RET
// double(res, x *Element)
// Sets res = x + x limb by limb with carry propagation, then conditionally
// subtracts the modulus via REDUCE.
TEXT ·double(SB), NOSPLIT, $0-16
// load x into (DX, CX, BX, SI)
MOVQ x+8(FP), AX
MOVQ 0(AX), DX
MOVQ 8(AX), CX
MOVQ 16(AX), BX
MOVQ 24(AX), SI
// add the value to itself with carry propagation
ADDQ DX, DX
ADCQ CX, CX
ADCQ BX, BX
ADCQ SI, SI
// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)
// store the result into res
MOVQ res+0(FP), R11
MOVQ DX, 0(R11)
MOVQ CX, 8(R11)
MOVQ BX, 16(R11)
MOVQ SI, 24(R11)
RET
// neg(res, x *Element)
// Sets res = modulus - x, except when all limbs of x are zero, in which case
// res is set to zero.
TEXT ·neg(SB), NOSPLIT, $0-16
MOVQ res+0(FP), DI
// load x into (DX, CX, BX, SI)
MOVQ x+8(FP), AX
MOVQ 0(AX), DX
MOVQ 8(AX), CX
MOVQ 16(AX), BX
MOVQ 24(AX), SI
// AX = OR of all limbs; zero iff x == 0
MOVQ DX, AX
ORQ CX, AX
ORQ BX, AX
ORQ SI, AX
TESTQ AX, AX
JEQ l1
// res = modulus - x, limb by limb with borrow propagation
// (MOVQ does not modify the flags, so the borrow chain is preserved)
MOVQ $0x43e1f593f0000001, R8
SUBQ DX, R8
MOVQ R8, 0(DI)
MOVQ $0x2833e84879b97091, R8
SBBQ CX, R8
MOVQ R8, 8(DI)
MOVQ $0xb85045b68181585d, R8
SBBQ BX, R8
MOVQ R8, 16(DI)
MOVQ $0x30644e72e131a029, R8
SBBQ SI, R8
MOVQ R8, 24(DI)
RET
// x == 0: AX is zero here, so this writes zero to every limb of res
l1:
MOVQ AX, 0(DI)
MOVQ AX, 8(DI)
MOVQ AX, 16(DI)
MOVQ AX, 24(DI)
RET
// reduce(res *Element)
// Conditionally subtracts the modulus from the 4-limb value at res, in place.
TEXT ·reduce(SB), NOSPLIT, $0-8
// load res into (DX, CX, BX, SI)
MOVQ res+0(FP), AX
MOVQ 0(AX), DX
MOVQ 8(AX), CX
MOVQ 16(AX), BX
MOVQ 24(AX), SI
// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)
// write the result back to res
MOVQ DX, 0(AX)
MOVQ CX, 8(AX)
MOVQ BX, 16(AX)
MOVQ SI, 24(AX)
RET
// MulBy3(x *Element)
// Multiplies x by 3 in place: computes 2x, then adds the original x, with a
// conditional subtraction of the modulus (REDUCE) after each step.
TEXT ·MulBy3(SB), NOSPLIT, $0-8
// load x into (DX, CX, BX, SI)
MOVQ x+0(FP), AX
MOVQ 0(AX), DX
MOVQ 8(AX), CX
MOVQ 16(AX), BX
MOVQ 24(AX), SI
// 2x
ADDQ DX, DX
ADCQ CX, CX
ADCQ BX, BX
ADCQ SI, SI
// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)
// 2x + x (x is still unmodified in memory)
ADDQ 0(AX), DX
ADCQ 8(AX), CX
ADCQ 16(AX), BX
ADCQ 24(AX), SI
// reduce element(DX,CX,BX,SI) using temp registers (R11,R12,R13,R14)
REDUCE(DX,CX,BX,SI,R11,R12,R13,R14)
// write the result back to x
MOVQ DX, 0(AX)
MOVQ CX, 8(AX)
MOVQ BX, 16(AX)
MOVQ SI, 24(AX)
RET
// MulBy5(x *Element)
// Multiplies x by 5 in place: computes 2x, then 4x, then adds the original x,
// with a conditional subtraction of the modulus (REDUCE) after each step.
TEXT ·MulBy5(SB), NOSPLIT, $0-8
// load x into (DX, CX, BX, SI)
MOVQ x+0(FP), AX
MOVQ 0(AX), DX
MOVQ 8(AX), CX
MOVQ 16(AX), BX
MOVQ 24(AX), SI
// 2x
ADDQ DX, DX
ADCQ CX, CX
ADCQ BX, BX
ADCQ SI, SI
// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)
// 4x
ADDQ DX, DX
ADCQ CX, CX
ADCQ BX, BX
ADCQ SI, SI
// reduce element(DX,CX,BX,SI) using temp registers (R11,R12,R13,R14)
REDUCE(DX,CX,BX,SI,R11,R12,R13,R14)
// 4x + x (x is still unmodified in memory)
ADDQ 0(AX), DX
ADCQ 8(AX), CX
ADCQ 16(AX), BX
ADCQ 24(AX), SI
// reduce element(DX,CX,BX,SI) using temp registers (R15,DI,R8,R9)
REDUCE(DX,CX,BX,SI,R15,DI,R8,R9)
// write the result back to x
MOVQ DX, 0(AX)
MOVQ CX, 8(AX)
MOVQ BX, 16(AX)
MOVQ SI, 24(AX)
RET
// MulBy13(x *Element)
//
// Replaces x with 13*x in place, computed as 8x + 4x + x: the value is
// doubled twice (4x), a copy of 4x is kept in R11..R14, it is doubled once
// more (8x), then 4x and the original x are added. REDUCE is applied after
// every doubling and every addition.
// NOTE(review): REDUCE is defined outside this section; presumably it brings
// the value back below the modulus — confirm against the macro definition.
TEXT ·MulBy13(SB), NOSPLIT, $0-8
MOVQ x+0(FP), AX
// limbs of x, least-significant first: DX, CX, BX, SI
MOVQ 0(AX), DX
MOVQ 8(AX), CX
MOVQ 16(AX), BX
MOVQ 24(AX), SI
// 2x
ADDQ DX, DX
ADCQ CX, CX
ADCQ BX, BX
ADCQ SI, SI
// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)
// 4x
ADDQ DX, DX
ADCQ CX, CX
ADCQ BX, BX
ADCQ SI, SI
// reduce element(DX,CX,BX,SI) using temp registers (R11,R12,R13,R14)
REDUCE(DX,CX,BX,SI,R11,R12,R13,R14)
// keep a copy of 4x; R11..R14 were only scratch for the REDUCE above
MOVQ DX, R11
MOVQ CX, R12
MOVQ BX, R13
MOVQ SI, R14
// 8x
ADDQ DX, DX
ADCQ CX, CX
ADCQ BX, BX
ADCQ SI, SI
// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)
// 12x = 8x + saved 4x
ADDQ R11, DX
ADCQ R12, CX
ADCQ R13, BX
ADCQ R14, SI
// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)
// 13x: add the original x, still unmodified in memory
ADDQ 0(AX), DX
ADCQ 8(AX), CX
ADCQ 16(AX), BX
ADCQ 24(AX), SI
// reduce element(DX,CX,BX,SI) using temp registers (DI,R8,R9,R10)
REDUCE(DX,CX,BX,SI,DI,R8,R9,R10)
// store the result over x
MOVQ DX, 0(AX)
MOVQ CX, 8(AX)
MOVQ BX, 16(AX)
MOVQ SI, 24(AX)
RET
// Butterfly(a, b *Element) sets a = a + b; b = a - b
//
// Both results are computed from the ORIGINAL value of a: a is loaded once
// into CX,BX,SI,DI (used for the sum) and copied into R8..R11 (used for the
// difference) before anything is written back.
// The difference is corrected by adding the 256-bit constant q (four
// immediates below, least-significant limb first) when the subtraction
// borrowed; the sum goes through the REDUCE macro.
// NOTE(review): q is presumably the field modulus and REDUCE (defined outside
// this section) presumably a conditional subtraction of it — confirm.
TEXT ·Butterfly(SB), NOSPLIT, $0-16
MOVQ a+0(FP), AX
// limbs of a, least-significant first: CX, BX, SI, DI
MOVQ 0(AX), CX
MOVQ 8(AX), BX
MOVQ 16(AX), SI
MOVQ 24(AX), DI
// second copy of a for the subtraction
MOVQ CX, R8
MOVQ BX, R9
MOVQ SI, R10
MOVQ DI, R11
// AX = 0: source operand for the conditional moves below (a's pointer is reloaded later)
XORQ AX, AX
MOVQ b+8(FP), DX
// CX,BX,SI,DI = a + b
ADDQ 0(DX), CX
ADCQ 8(DX), BX
ADCQ 16(DX), SI
ADCQ 24(DX), DI
// R8..R11 = a - b; the carry flag after the last SBBQ is the final borrow
SUBQ 0(DX), R8
SBBQ 8(DX), R9
SBBQ 16(DX), R10
SBBQ 24(DX), R11
// load q; MOVQ leaves the flags untouched, so the borrow is still available
MOVQ $0x43e1f593f0000001, R12
MOVQ $0x2833e84879b97091, R13
MOVQ $0xb85045b68181585d, R14
MOVQ $0x30644e72e131a029, R15
// no borrow (carry clear): replace q with 0 so the addition below is a no-op
CMOVQCC AX, R12
CMOVQCC AX, R13
CMOVQCC AX, R14
CMOVQCC AX, R15
// R8..R11 = (a - b) + (q if the subtraction borrowed, else 0)
ADDQ R12, R8
ADCQ R13, R9
ADCQ R14, R10
ADCQ R15, R11
// b = a - b
MOVQ R8, 0(DX)
MOVQ R9, 8(DX)
MOVQ R10, 16(DX)
MOVQ R11, 24(DX)
// reduce element(CX,BX,SI,DI) using temp registers (R8,R9,R10,R11)
REDUCE(CX,BX,SI,DI,R8,R9,R10,R11)
// a = a + b; AX was zeroed above, so reload the pointer first
MOVQ a+0(FP), AX
MOVQ CX, 0(AX)
MOVQ BX, 8(AX)
MOVQ SI, 16(AX)
MOVQ DI, 24(AX)
RET

+ 78
- 0
ff/element_ops_noasm.go

@ -0,0 +1,78 @@
//go:build !amd64
// +build !amd64
// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by consensys/gnark-crypto DO NOT EDIT
package ff
// /!\ WARNING /!\
// this code has not been audited and is provided as-is. In particular,
// there are no security guarantees such as constant-time implementation
// or side-channel attack resistance
// /!\ WARNING /!\
// MulBy3 multiplies x by 3 in place (x *= 3).
//
// This is the implementation used when the build target is not amd64; it
// forwards to mulByConstant with the constant 3.
func MulBy3(x *Element) {
mulByConstant(x, 3)
}
// MulBy5 multiplies x by 5 in place (x *= 5).
//
// This is the implementation used when the build target is not amd64; it
// forwards to mulByConstant with the constant 5.
func MulBy5(x *Element) {
mulByConstant(x, 5)
}
// MulBy13 multiplies x by 13 in place (x *= 13).
//
// This is the implementation used when the build target is not amd64; it
// forwards to mulByConstant with the constant 13.
func MulBy13(x *Element) {
mulByConstant(x, 13)
}
// Butterfly sets
// a = a + b
// b = a - b
//
// This is the implementation used when the build target is not amd64; it
// forwards to _butterflyGeneric.
// NOTE(review): in the amd64 assembly version both results are computed from
// the original value of a; presumably _butterflyGeneric does the same —
// confirm.
func Butterfly(a, b *Element) {
_butterflyGeneric(a, b)
}
// mul is the multiplication primitive used when the build target is not
// amd64; it forwards to _mulGeneric.
// NOTE(review): z is presumably the destination and x, y the operands —
// confirm against _mulGeneric.
func mul(z, x, y *Element) {
_mulGeneric(z, x, y)
}
// fromMont converts z in place (i.e. mutates) from Montgomery to regular representation
// sets z = z * 1
//
// It has no return value. This is the implementation used when the build
// target is not amd64; it forwards to _fromMontGeneric.
func fromMont(z *Element) {
_fromMontGeneric(z)
}
// add is the addition primitive used when the build target is not amd64; it
// forwards to _addGeneric.
// NOTE(review): z is presumably the destination and x, y the operands —
// confirm against _addGeneric.
func add(z, x, y *Element) {
_addGeneric(z, x, y)
}
// double is the doubling primitive used when the build target is not amd64;
// it forwards to _doubleGeneric.
// NOTE(review): z is presumably the destination and x the operand — confirm
// against _doubleGeneric.
func double(z, x *Element) {
_doubleGeneric(z, x)
}
// sub is the subtraction primitive used when the build target is not amd64;
// it forwards to _subGeneric.
// NOTE(review): z is presumably the destination, x the minuend and y the
// subtrahend — confirm against _subGeneric.
func sub(z, x, y *Element) {
_subGeneric(z, x, y)
}
// neg is the negation primitive used when the build target is not amd64; it
// forwards to _negGeneric.
// NOTE(review): z is presumably the destination and x the operand, matching
// the neg(res, x) assembly routine used on amd64 — confirm.
func neg(z, x *Element) {
_negGeneric(z, x)
}
// reduce is the reduction primitive used when the build target is not amd64;
// it forwards to _reduceGeneric, which operates on z in place.
func reduce(z *Element) {
_reduceGeneric(z)
}

+ 1769
- 129
ff/element_test.go
File diff suppressed because it is too large
View File


+ 0
- 6
ff/util.go

@ -1,6 +0,0 @@
package ff
// NewElement allocates a fresh, zero-valued Element and hands back a pointer
// to it. Every call yields a distinct Element.
func NewElement() *Element {
	var fresh Element
	return &fresh
}

+ 5
- 0
go.mod

@ -6,4 +6,9 @@ require (
github.com/dchest/blake512 v1.0.0
github.com/stretchr/testify v1.7.0
golang.org/x/crypto v0.0.0-20211117183948-ae814b36b871
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e // indirect
github.com/davecgh/go-spew v1.1.0 // indirect
github.com/leanovate/gopter v0.2.9 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect
)

+ 4
- 0
go.sum

@ -2,6 +2,8 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dchest/blake512 v1.0.0 h1:oDFEQFIqFSeuA34xLtXZ/rWxCXdSjirjzPhey5EUvmA=
github.com/dchest/blake512 v1.0.0/go.mod h1:FV1x7xPPLWukZlpDpWQ88rF/SFwZ5qbskrzhLMB92JI=
github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c=
github.com/leanovate/gopter v0.2.9/go.mod h1:U2L/78B+KVFIx2VmW6onHJQzXtFb+p5y3y2Sh+Jxxv8=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@ -14,6 +16,8 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1 h1:SrN+KX8Art/Sf4HNj6Zcz06G7VEz+7w9tdXTPOZ7+l4=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e h1:fLOSk5Q00efkSvAm+4xcoXD+RRmLmmulPn5I3Y9F2EM=
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

+ 1
- 1
poseidon/poseidon.go

@ -20,7 +20,7 @@ func zero() *ff.Element {
// exp5 performs x^5 mod p
// https://eprint.iacr.org/2019/458.pdf page 8
func exp5(a *ff.Element) {
a.Exp(*a, 5) //nolint:gomnd
a.Exp(*a, big.NewInt(5)) //nolint:gomnd
}
// exp5state perform exp5 for whole state

Loading…
Cancel
Save