Browse Source

feat: Plonk optimizations (#39)

* Fixed poseidon hash TODO in fri/fri.go

* optimized goldilocks

* Another optimization

* Down to 16 million

* Finished TODOs
main
puma314 2 years ago
committed by GitHub
parent
commit
89b5a01e4b
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 171 additions and 82 deletions
  1. +4
    -1
      .gitignore
  2. +7
    -13
      README.md
  3. +2
    -2
      benchmark.go
  4. +25
    -45
      fri/fri.go
  5. +80
    -0
      fri/fri_utils.go
  6. +3
    -4
      goldilocks/base.go
  7. +0
    -2
      plonk/plonk.go
  8. +10
    -13
      poseidon/bn254.go
  9. +40
    -2
      types/types.go

+ 4
- 1
.gitignore

@ -15,4 +15,7 @@
# vendor/
gnark-ed25519
gnark.pprof
gnark.pprof
# Output from pprof visualization
verifier.png

+ 7
- 13
README.md

@ -19,20 +19,14 @@ To run the benchmark,
go run benchmark.go
```
Here are relevant numbers from a benchmark run on an M1 Max with 10 CPU cores.
## Profiling
First run the benchmark with profiling turned on
```
11:04:08 INF compiling circuit
11:04:08 INF parsed circuit inputs nbPublic=0 nbSecret=0
11:12:30 INF building constraint system nbConstraints=6740784
Generating witness 2023-03-28 11:12:42.702566 -0700 PDT m=+514.333410376
Running circuit setup 2023-03-28 11:12:42.702666 -0700 PDT m=+514.333509834
Creating proof 2023-03-28 11:18:58.881518 -0700 PDT m=+890.519971543
11:18:59 DBG constraint system solver done backend=groth16 nbConstraints=6740784 took=675.361625
11:19:10 DBG prover done backend=groth16 curve=bn254 nbConstraints=6740784 took=10512.664584
Verifying proof 2023-03-28 11:19:10.169636 -0700 PDT m=+901.808314709
11:19:10 DBG verifier done backend=groth16 curve=bn254 took=6.288792
bn254 2023-03-28 11:19:10.175992 -0700 PDT m=+901.814670834
go run benchmark.go -profile
```
The circuit can be significantly optimized by using more efficient arithmetic for Goldilocks, among other things.
Then use the following command to generate a visualization of the pprof
```
go tool pprof --png gnark.pprof > verifier.png
```

+ 2
- 2
benchmark.go

@ -282,8 +282,8 @@ func groth16Proof(r1cs constraint.ConstraintSystem, circuitName string, dummy bo
}
func main() {
plonky2Circuit := flag.String("plonky2-circuit", "", "plonky2 circuit to benchmark")
proofSystem := flag.String("proof-system", "groth16", "proof system to benchmark")
plonky2Circuit := flag.String("plonky2-circuit", "step", "plonky2 circuit to benchmark")
proofSystem := flag.String("proof-system", "plonk", "proof system to benchmark")
profileCircuit := flag.Bool("profile", true, "profile the circuit")
dummySetup := flag.Bool("dummy", true, "use dummy setup")
saveArtifacts := flag.Bool("save", false, "save circuit artifacts")

+ 25
- 45
fri/fri.go

@ -106,11 +106,15 @@ func (f *Chip) verifyMerkleProofToCapWithCapIndex(
currentDigest := f.poseidonBN254Chip.HashOrNoop(leafData)
for i, sibling := range proof.Siblings {
bit := leafIndexBits[i]
// TODO: Don't need to do two hashes by using a trick that the plonky2 verifier circuit does
// https://github.com/mir-protocol/plonky2/blob/973624f12d2d12d74422b3ea051358b9eaacb050/plonky2/src/gates/poseidon.rs#L298
leftHash := f.poseidonBN254Chip.TwoToOne(sibling, currentDigest)
rightHash := f.poseidonBN254Chip.TwoToOne(currentDigest, sibling)
currentDigest = f.api.Select(bit, leftHash, rightHash)
var inputs poseidon.BN254State
inputs[0] = frontend.Variable(0)
inputs[1] = frontend.Variable(0)
inputs[2] = f.api.Select(bit, sibling, currentDigest)
inputs[3] = f.api.Select(bit, currentDigest, sibling)
state := f.poseidonBN254Chip.Poseidon(inputs)
currentDigest = state[0]
}
// We assume that the cap_height is 4. Create two levels of the Lookup2 circuit
@ -152,29 +156,6 @@ func (f *Chip) verifyInitialProof(xIndexBits []frontend.Variable, proof *variabl
}
}
// We decompose FRI query indices into bits without verifying that the decomposition given by
// the prover is the canonical one. In particular, if `x_index < 2^field_bits - p`, then the
// prover could supply the binary encoding of either `x_index` or `x_index + p`, since they are
// congruent mod `p`. However, this only occurs with probability
//
//	p_ambiguous = (2^field_bits - p) / p
//
// which is small for the field that we use in practice.
//
// In particular, the soundness error of one FRI query is roughly the codeword rate, which
// is much larger than this ambiguous-element probability given any reasonable parameters.
// Thus ambiguous elements contribute a negligible amount to soundness error.
//
// Here we compare the probabilities as a sanity check, to verify the claim above.
func (f *Chip) assertNoncanonicalIndicesOK() {
	modulus := goldilocks.Modulus().Uint64()
	// Count of uint64 values at or above the Goldilocks modulus, i.e. the
	// elements that admit two distinct 64-bit encodings.
	numAmbiguous := uint64(math.MaxUint64) - modulus + 1
	pAmbiguous := float64(numAmbiguous) / float64(modulus)
	// TODO: Check that pAmbiguous value is the same as the one in plonky2 verifier
	queryError := f.friParams.Config.Rate()
	if pAmbiguous >= queryError*1e-5 {
		panic("A non-negligible portion of field elements are in the range that permits non-canonical encodings. Need to do more analysis or enforce canonical encodings.")
	}
}
func (f *Chip) expFromBitsConstBase(
base goldilocks.Element,
exponentBits []frontend.Variable,
@ -209,7 +190,7 @@ func (f *Chip) calculateSubgroupX(
) gl.Variable {
// Compute x from its index
// `subgroup_x` is `subgroup[x_index]`, i.e., the actual field element in the domain.
// TODO - Make these as global values
// OPTIMIZE - Make these as global values
g := gl.NewVariable(gl.MULTIPLICATIVE_GROUP_GENERATOR.Uint64())
base := gl.PrimitiveRootOfUnity(nLog)
@ -343,7 +324,7 @@ func (f *Chip) computeEvaluation(
// The evaluation vector needs to be reordered first. Permute the evals array such that each
// element's new index is the bit reverse of it's original index.
// TODO: Optimization - Since the size of the evals array should be constant (e.g. 2^arityBits),
// OPTIMIZE - Since the size of the evals array should be constant (e.g. 2^arityBits),
// we can just hard code the permutation.
permutedEvals := make([]gl.QuadraticExtensionVariable, len(evals))
for i := uint8(0); i < uint8(len(evals)); i++ {
@ -363,14 +344,14 @@ func (f *Chip) computeEvaluation(
xPoints := make([]gl.QuadraticExtensionVariable, len(evals))
yPoints := permutedEvals
// TODO: Make g_F a constant
// OPTIMIZE: Make g_F a constant
g_F := gl.NewVariable(g.Uint64()).ToQuadraticExtension()
xPoints[0] = gl.QuadraticExtensionVariable{cosetStart, gl.Zero()}
for i := 1; i < len(evals); i++ {
xPoints[i] = f.gl.MulExtension(xPoints[i-1], g_F)
}
// TODO: This is n^2. Is there a way to do this better?
// OPTIMIZE: This is n^2. Is there a way to do this better?
// Compute the barycentric weights
barycentricWeights := make([]gl.QuadraticExtensionVariable, len(xPoints))
for i := 0; i < len(xPoints); i++ {
@ -385,7 +366,7 @@ func (f *Chip) computeEvaluation(
}
}
// Take the inverse of the barycentric weights
// TODO: Can provide a witness to this value
// OPTIMIZE: Can provide a witness to this value
barycentricWeights[i] = f.gl.InverseExtension(barycentricWeights[i])
}
@ -403,7 +384,9 @@ func (f *Chip) verifyQueryRound(
nLog uint64,
roundProof *variables.FriQueryRound,
) {
f.assertNoncanonicalIndicesOK()
// Note assertNoncanonicalIndicesOK does not add any constraints, it's a sanity check on the config
assertNoncanonicalIndicesOK(*f.friParams)
xIndex = f.gl.Reduce(xIndex)
xIndexBits := f.api.ToBinary(xIndex.Limb, 64)[0 : f.friParams.DegreeBits+f.friParams.Config.RateBits]
capIndexBits := xIndexBits[len(xIndexBits)-int(f.friParams.Config.CapHeight):]
@ -511,21 +494,18 @@ func (f *Chip) VerifyFriProof(
initialMerkleCaps []variables.FriMerkleCap,
friProof *variables.FriProof,
) {
// TODO: Check fri config
/* if let Some(max_arity_bits) = params.max_arity_bits() {
self.check_recursion_config::<C>(max_arity_bits);
}
debug_assert_eq!(
params.final_poly_len(),
proof.final_poly.len(),
"Final polynomial has wrong degree."
); */
// Not adding any constraints but a sanity check on the proof shape matching the friParams (constant).
validateFriProofShape(friProof, instance, f.friParams)
// Check POW
f.assertLeadingZeros(friChallenges.FriPowResponse, f.friParams.Config)
// Check that parameters are coherent. Not adding any constraints but a sanity check
// on the proof shape matching the friParams.
if int(f.friParams.Config.NumQueryRounds) != len(friProof.QueryRoundProofs) {
panic("Number of query rounds does not match config.")
}
precomputedReducedEvals := f.fromOpeningsAndAlpha(&openings, friChallenges.FriAlpha)
// Size of the LDE domain.

+ 80
- 0
fri/fri_utils.go

@ -1,7 +1,11 @@
package fri
import (
"math"
"github.com/consensys/gnark-crypto/field/goldilocks"
"github.com/succinctlabs/gnark-plonky2-verifier/types"
"github.com/succinctlabs/gnark-plonky2-verifier/variables"
)
type PolynomialInfo struct {
@ -146,3 +150,79 @@ func friAllPolys(c *types.CommonCircuitData) []PolynomialInfo {
return returnArr
}
// Sanity check (adds no circuit constraints) that the fraction of Goldilocks
// elements admitting a non-canonical 64-bit encoding is negligible relative to
// the FRI query soundness error. 1-1 port of assert_noncanonical_indices_ok
// from fri::recursive_verifier in plonky2; panics if the check fails.
func assertNoncanonicalIndicesOK(friParams types.FriParams) {
	fieldOrder := goldilocks.Modulus().Uint64()
	ambiguousElems := uint64(math.MaxUint64) - fieldOrder + 1
	queryError := friParams.Config.Rate()
	ambiguousProb := float64(ambiguousElems) / float64(fieldOrder)
	if ambiguousProb >= queryError*1e-5 {
		panic("A non-negligible portion of field elements are in the range that permits non-canonical encodings. Need to do more analysis or enforce canonical encodings.")
	}
}
// validateFriProofShape checks that the shapes of the proof variable agree with
// the given (constant) FriParams. It adds no circuit constraints — it panics on
// any mismatch. It's a 1-1 port of validate_fri_proof_shape from
// fri::validate_shape in plonky2.
func validateFriProofShape(proof *variables.FriProof, instance InstanceInfo, params *types.FriParams) {
	// Number of extra salt elements per leaf when an oracle is blinded (hiding).
	const saltSize = 4

	commitPhaseMerkleCaps := proof.CommitPhaseMerkleCaps
	queryRoundProofs := proof.QueryRoundProofs
	finalPoly := proof.FinalPoly

	capHeight := params.Config.CapHeight
	// NOTE: renamed loop variable from `cap` — it shadowed the Go builtin cap().
	for _, merkleCap := range commitPhaseMerkleCaps {
		if 1<<capHeight != len(merkleCap) {
			panic("config cap_height does not match commit_phase_merkle_caps")
		}
	}

	for _, queryRound := range queryRoundProofs {
		initialTreesProof := queryRound.InitialTreesProof
		steps := queryRound.Steps
		if len(initialTreesProof.EvalsProofs) != len(instance.Oracles) {
			panic("eval proofs length is not equal to instance oracles length")
		}

		for i, evalProof := range initialTreesProof.EvalsProofs {
			leaf := evalProof.Elements
			merkleProof := evalProof.MerkleProof
			oracle := instance.Oracles[i]
			// Blinded oracles carry saltSize extra elements per leaf when hiding is on.
			salt := 0
			if oracle.Blinding && params.Hiding {
				salt = saltSize
			}

			if len(leaf) != (int(oracle.NumPolys) + salt) {
				panic("eval proof leaf length doesn't match oracle info")
			}

			// Merkle path length plus cap height must cover the full LDE domain depth.
			if len(merkleProof.Siblings)+int(capHeight) != params.LdeBits() {
				panic("length of merkle proof + capHeight doesn't match lde_bits from params")
			}
		}

		if len(steps) != len(params.ReductionArityBits) {
			panic("length of steps != params.reduction_arity_bits")
		}

		// Each FRI reduction step shrinks the codeword by its arity.
		codewordLenBits := params.LdeBits()
		for i, step := range steps {
			evals := step.Evals
			merkleProof := step.MerkleProof
			arityBits := params.ReductionArityBits[i]
			arity := 1 << arityBits
			codewordLenBits -= int(arityBits)

			if len(evals) != arity {
				panic("len evals doesn't match arity")
			}

			if len(merkleProof.Siblings)+int(capHeight) != codewordLenBits {
				panic("len merkleProof doesn't match codewordLenBits")
			}
		}
	}

	if len(finalPoly.Coeffs) != params.FinalPolyLen() {
		panic("len finalPoly doesn't match params FinalPolyLen")
	}
}

+ 3
- 4
goldilocks/base.go

@ -131,9 +131,8 @@ func (p *Chip) MulAdd(a Variable, b Variable, c Variable) Variable {
quotient := NewVariable(result[0])
remainder := NewVariable(result[1])
lhs := p.api.Mul(a.Limb, b.Limb)
lhs = p.api.Add(lhs, c.Limb)
rhs := p.api.Add(p.api.Mul(quotient.Limb, MODULUS), remainder.Limb)
lhs := p.api.MulAcc(c.Limb, a.Limb, b.Limb)
rhs := p.api.MulAcc(remainder.Limb, MODULUS, quotient.Limb)
p.api.AssertIsEqual(lhs, rhs)
p.RangeCheck(quotient)
@ -144,7 +143,7 @@ func (p *Chip) MulAdd(a Variable, b Variable, c Variable) Variable {
// Multiplies two field elements and adds a third field element, computing x * y + z within the
// Goldilocks field without reducing.
func (p *Chip) MulAddNoReduce(a Variable, b Variable, c Variable) Variable {
return p.AddNoReduce(p.MulNoReduce(a, b), c)
return NewVariable(p.api.MulAcc(c.Limb, a.Limb, b.Limb))
}
// The hint used to compute MulAdd.

+ 0
- 2
plonk/plonk.go

@ -25,8 +25,6 @@ type PlonkChip struct {
}
func NewPlonkChip(api frontend.API, commonData types.CommonCircuitData) *PlonkChip {
// TODO: Should degreeBits be verified that it fits within the field and that degree is within uint64?
// Create the gates based on commonData GateIds
createdGates := []gates.Gate{}
for _, gateId := range commonData.GateIds {

+ 10
- 13
poseidon/bn254.go

@ -47,6 +47,9 @@ func (c *BN254Chip) HashNoPad(input []gl.Variable) BN254HashOut {
frontend.Variable(0),
}
two_to_32 := new(big.Int).SetInt64(1 << 32)
two_to_64 := new(big.Int).Mul(two_to_32, two_to_32)
for i := 0; i < len(input); i += BN254_SPONGE_RATE * 3 {
endI := c.min(len(input), i+BN254_SPONGE_RATE*3)
rateChunk := input[i:endI]
@ -54,13 +57,12 @@ func (c *BN254Chip) HashNoPad(input []gl.Variable) BN254HashOut {
endJ := c.min(len(rateChunk), j+3)
bn254Chunk := rateChunk[j:endJ]
bits := []frontend.Variable{}
inter := frontend.Variable(0)
for k := 0; k < len(bn254Chunk); k++ {
bn254Chunk[k] = c.gl.Reduce(bn254Chunk[k])
bits = append(bits, c.api.ToBinary(bn254Chunk[k].Limb, 64)...)
inter = c.api.MulAcc(inter, bn254Chunk[k].Limb, new(big.Int).Exp(two_to_64, big.NewInt(int64(k)), nil))
}
state[stateIdx+1] = c.api.FromBinary(bits...)
state[stateIdx+1] = inter
}
state = c.Poseidon(state)
@ -75,7 +77,7 @@ func (c *BN254Chip) HashOrNoop(input []gl.Variable) BN254HashOut {
alpha := new(big.Int).SetInt64(1 << 32)
for i, inputElement := range input {
returnVal = c.api.Add(returnVal, c.api.Mul(inputElement, alpha.Exp(alpha, big.NewInt(int64(i)), nil)))
returnVal = c.api.MulAcc(returnVal, inputElement, alpha.Exp(alpha, big.NewInt(int64(i)), nil))
}
return BN254HashOut(returnVal)
@ -145,16 +147,13 @@ func (c *BN254Chip) partialRounds(state BN254State) BN254State {
state[0] = c.exp5(state[0])
state[0] = c.api.Add(state[0], cConstants[(BN254_FULL_ROUNDS/2+1)*BN254_SPONGE_WIDTH+i])
var mul frontend.Variable
newState0 := frontend.Variable(0)
for j := 0; j < BN254_SPONGE_WIDTH; j++ {
mul = c.api.Mul(sConstants[(BN254_SPONGE_WIDTH*2-1)*i+j], state[j])
newState0 = c.api.Add(newState0, mul)
newState0 = c.api.MulAcc(newState0, sConstants[(BN254_SPONGE_WIDTH*2-1)*i+j], state[j])
}
for k := 1; k < BN254_SPONGE_WIDTH; k++ {
mul = c.api.Mul(state[0], sConstants[(BN254_SPONGE_WIDTH*2-1)*i+BN254_SPONGE_WIDTH+k-1])
state[k] = c.api.Add(state[k], mul)
state[k] = c.api.MulAcc(state[k], state[0], sConstants[(BN254_SPONGE_WIDTH*2-1)*i+BN254_SPONGE_WIDTH+k-1])
}
state[0] = newState0
}
@ -186,7 +185,6 @@ func (c *BN254Chip) exp5state(state BN254State) BN254State {
}
func (c *BN254Chip) mix(state_ BN254State, constantMatrix [][]*big.Int) BN254State {
var mul frontend.Variable
var result BN254State
for i := 0; i < BN254_SPONGE_WIDTH; i++ {
@ -195,8 +193,7 @@ func (c *BN254Chip) mix(state_ BN254State, constantMatrix [][]*big.Int) BN254Sta
for i := 0; i < BN254_SPONGE_WIDTH; i++ {
for j := 0; j < BN254_SPONGE_WIDTH; j++ {
mul = c.api.Mul(constantMatrix[j][i], state_[j])
result[i] = c.api.Add(result[i], mul)
result[i] = c.api.MulAcc(result[i], constantMatrix[j][i], state_[j])
}
}

+ 40
- 2
types/types.go

@ -1,13 +1,17 @@
package types
import "github.com/succinctlabs/gnark-plonky2-verifier/plonk/gates"
import (
"github.com/succinctlabs/gnark-plonky2-verifier/plonk/gates"
)
// FriConfig holds the constant FRI configuration parameters, mirroring the
// plonky2 FriConfig struct.
type FriConfig struct {
	// RateBits is the log2 blowup factor of the LDE: LdeBits = DegreeBits + RateBits.
	RateBits uint64
	// CapHeight is the log2 of the Merkle cap size; caps contain 1 << CapHeight entries.
	CapHeight uint64
	// ProofOfWorkBits parameterizes the FRI proof-of-work (grinding) check —
	// presumably the required number of leading zeros of the PoW response;
	// confirm against plonky2's FriConfig.
	ProofOfWorkBits uint64
	// NumQueryRounds is the number of FRI query rounds; it must equal the
	// number of query round proofs carried by a FriProof.
	NumQueryRounds uint64
	// Note that we do not need `reduction_strategy` of type FriReductionStrategy as the plonky2 FriConfig has.
	// reduction_strategy is only used for computing `reduction_arity_bits`, which is serialized in the
	// CommonCircuitData.
}
func (fc *FriConfig) Rate() float64 {
@ -21,6 +25,40 @@ type FriParams struct {
ReductionArityBits []uint64
}
// TotalArities returns the sum of all entries of ReductionArityBits, i.e. the
// total log2 degree reduction across all FRI rounds.
func (p *FriParams) TotalArities() int {
	sum := 0
	for _, bits := range p.ReductionArityBits {
		sum += int(bits)
	}
	return sum
}
// MaxArityBits returns the largest entry of ReductionArityBits, or 0 when
// there are no reduction steps.
func (p *FriParams) MaxArityBits() int {
	largest := 0
	for _, bits := range p.ReductionArityBits {
		if v := int(bits); v > largest {
			largest = v
		}
	}
	return largest
}
// LdeBits returns the log2 size of the LDE domain: DegreeBits + RateBits.
func (p *FriParams) LdeBits() int {
	return int(p.DegreeBits) + int(p.Config.RateBits)
}
// LdeSize returns the number of elements in the LDE domain, 2^LdeBits.
func (p *FriParams) LdeSize() int {
	size := 1 << p.LdeBits()
	return size
}
// FinalPolyBits returns the log2 degree of the final FRI polynomial:
// DegreeBits minus the total arity reduction over all rounds.
func (p *FriParams) FinalPolyBits() int {
	degreeBits := int(p.DegreeBits)
	return degreeBits - p.TotalArities()
}
// FinalPolyLen returns the number of coefficients of the final FRI polynomial,
// 2^FinalPolyBits.
func (p *FriParams) FinalPolyLen() int {
	return 1 << p.FinalPolyBits()
}
type CircuitConfig struct {
NumWires uint64
NumRoutedWires uint64

Loading…
Cancel
Save