diff --git a/.gitignore b/.gitignore
index 8045102..142f307 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,7 @@
 # vendor/
 
 gnark-ed25519
-gnark.pprof
\ No newline at end of file
+gnark.pprof
+
+# Output from pprof visualization
+verifier.png
\ No newline at end of file
diff --git a/README.md b/README.md
index 874f1d2..6279013 100644
--- a/README.md
+++ b/README.md
@@ -19,20 +19,14 @@ To run the benchmark,
 go run benchmark.go
 ```
 
-Here are relevant numbers from a benchmark ran on an M1 Max with 10 CPU cores.
+## Profiling
 
+First, run the benchmark with profiling turned on:
 ```
-11:04:08 INF compiling circuit
-11:04:08 INF parsed circuit inputs nbPublic=0 nbSecret=0
-11:12:30 INF building constraint system nbConstraints=6740784
-Generating witness 2023-03-28 11:12:42.702566 -0700 PDT m=+514.333410376
-Running circuit setup 2023-03-28 11:12:42.702666 -0700 PDT m=+514.333509834
-Creating proof 2023-03-28 11:18:58.881518 -0700 PDT m=+890.519971543
-11:18:59 DBG constraint system solver done backend=groth16 nbConstraints=6740784 took=675.361625
-11:19:10 DBG prover done backend=groth16 curve=bn254 nbConstraints=6740784 took=10512.664584
-Verifying proof 2023-03-28 11:19:10.169636 -0700 PDT m=+901.808314709
-11:19:10 DBG verifier done backend=groth16 curve=bn254 took=6.288792
-bn254 2023-03-28 11:19:10.175992 -0700 PDT m=+901.814670834
+go run benchmark.go -profile
 ```
 
-The circuit can be significantly optimized by using more efficient arithmetic for Goldilocks, among other things.
+Then use the following command to generate a visualization of the pprof profile:
+```
+go tool pprof --png gnark.pprof > verifier.png
+```
\ No newline at end of file
diff --git a/benchmark.go b/benchmark.go
index 67efe6e..c2fb005 100644
--- a/benchmark.go
+++ b/benchmark.go
@@ -282,8 +282,8 @@ func groth16Proof(r1cs constraint.ConstraintSystem, circuitName string, dummy bo
 }
 
 func main() {
-	plonky2Circuit := flag.String("plonky2-circuit", "", "plonky2 circuit to benchmark")
-	proofSystem := flag.String("proof-system", "groth16", "proof system to benchmark")
+	plonky2Circuit := flag.String("plonky2-circuit", "step", "plonky2 circuit to benchmark")
+	proofSystem := flag.String("proof-system", "plonk", "proof system to benchmark")
 	profileCircuit := flag.Bool("profile", true, "profile the circuit")
 	dummySetup := flag.Bool("dummy", true, "use dummy setup")
 	saveArtifacts := flag.Bool("save", false, "save circuit artifacts")
diff --git a/fri/fri.go b/fri/fri.go
index 7fe5b43..80d300f 100644
--- a/fri/fri.go
+++ b/fri/fri.go
@@ -106,11 +106,15 @@ func (f *Chip) verifyMerkleProofToCapWithCapIndex(
 	currentDigest := f.poseidonBN254Chip.HashOrNoop(leafData)
 	for i, sibling := range proof.Siblings {
 		bit := leafIndexBits[i]
-		// TODO: Don't need to do two hashes by using a trick that the plonky2 verifier circuit does
-		// https://github.com/mir-protocol/plonky2/blob/973624f12d2d12d74422b3ea051358b9eaacb050/plonky2/src/gates/poseidon.rs#L298
-		leftHash := f.poseidonBN254Chip.TwoToOne(sibling, currentDigest)
-		rightHash := f.poseidonBN254Chip.TwoToOne(currentDigest, sibling)
-		currentDigest = f.api.Select(bit, leftHash, rightHash)
+
+		var inputs poseidon.BN254State
+		inputs[0] = frontend.Variable(0)
+		inputs[1] = frontend.Variable(0)
+		inputs[2] = f.api.Select(bit, sibling, currentDigest)
+		inputs[3] = f.api.Select(bit, currentDigest, sibling)
+		state := f.poseidonBN254Chip.Poseidon(inputs)
+
+		currentDigest = state[0]
 	}
 
 	// We assume that the cap_height is 4.  Create two levels of the Lookup2 circuit
@@ -152,29 +156,6 @@ func (f *Chip) verifyInitialProof(xIndexBits []frontend.Variable, proof *variabl
 	}
 }
 
-// / We decompose FRI query indices into bits without verifying that the decomposition given by
-// / the prover is the canonical one. In particular, if `x_index < 2^field_bits - p`, then the
-// / prover could supply the binary encoding of either `x_index` or `x_index + p`, since they are
-// / congruent mod `p`. However, this only occurs with probability
-// /     p_ambiguous = (2^field_bits - p) / p
-// / which is small for the field that we use in practice.
-// /
-// / In particular, the soundness error of one FRI query is roughly the codeword rate, which
-// / is much larger than this ambiguous-element probability given any reasonable parameters.
-// / Thus ambiguous elements contribute a negligible amount to soundness error.
-// /
-// / Here we compare the probabilities as a sanity check, to verify the claim above.
-func (f *Chip) assertNoncanonicalIndicesOK() {
-	numAmbiguousElems := uint64(math.MaxUint64) - goldilocks.Modulus().Uint64() + 1
-	queryError := f.friParams.Config.Rate()
-	pAmbiguous := float64(numAmbiguousElems) / float64(goldilocks.Modulus().Uint64())
-
-	// TODO:  Check that pAmbiguous value is the same as the one in plonky2 verifier
-	if pAmbiguous >= queryError*1e-5 {
-		panic("A non-negligible portion of field elements are in the range that permits non-canonical encodings. Need to do more analysis or enforce canonical encodings.")
-	}
-}
-
 func (f *Chip) expFromBitsConstBase(
 	base goldilocks.Element,
 	exponentBits []frontend.Variable,
@@ -209,7 +190,7 @@ func (f *Chip) calculateSubgroupX(
 ) gl.Variable {
 	// Compute x from its index
 	// `subgroup_x` is `subgroup[x_index]`, i.e., the actual field element in the domain.
-	// TODO - Make these as global values
+	// OPTIMIZE - Make these global values
 	g := gl.NewVariable(gl.MULTIPLICATIVE_GROUP_GENERATOR.Uint64())
 	base := gl.PrimitiveRootOfUnity(nLog)
 
@@ -343,7 +324,7 @@ func (f *Chip) computeEvaluation(
 
 	// The evaluation vector needs to be reordered first.  Permute the evals array such that each
 	// element's new index is the bit reverse of it's original index.
-	// TODO:  Optimization - Since the size of the evals array should be constant (e.g. 2^arityBits),
+	// OPTIMIZE - Since the size of the evals array should be constant (e.g. 2^arityBits),
 	//        we can just hard code the permutation.
 	permutedEvals := make([]gl.QuadraticExtensionVariable, len(evals))
 	for i := uint8(0); i < uint8(len(evals)); i++ {
@@ -363,14 +344,14 @@ func (f *Chip) computeEvaluation(
 	xPoints := make([]gl.QuadraticExtensionVariable, len(evals))
 	yPoints := permutedEvals
 
-	// TODO: Make g_F a constant
+	// OPTIMIZE: Make g_F a constant
 	g_F := gl.NewVariable(g.Uint64()).ToQuadraticExtension()
 	xPoints[0] = gl.QuadraticExtensionVariable{cosetStart, gl.Zero()}
 	for i := 1; i < len(evals); i++ {
 		xPoints[i] = f.gl.MulExtension(xPoints[i-1], g_F)
 	}
 
-	// TODO:  This is n^2.  Is there a way to do this better?
+	// OPTIMIZE:  This is n^2.  Is there a way to do this better?
 	// Compute the barycentric weights
 	barycentricWeights := make([]gl.QuadraticExtensionVariable, len(xPoints))
 	for i := 0; i < len(xPoints); i++ {
@@ -385,7 +366,7 @@ func (f *Chip) computeEvaluation(
 			}
 		}
 		// Take the inverse of the barycentric weights
-		// TODO: Can provide a witness to this value
+		// OPTIMIZE: Can provide a witness to this value
 		barycentricWeights[i] = f.gl.InverseExtension(barycentricWeights[i])
 	}
 
@@ -403,7 +384,9 @@ func (f *Chip) verifyQueryRound(
 	nLog uint64,
 	roundProof *variables.FriQueryRound,
 ) {
-	f.assertNoncanonicalIndicesOK()
+	// Note: assertNoncanonicalIndicesOK does not add any constraints; it is a sanity check on the config.
+	assertNoncanonicalIndicesOK(*f.friParams)
+
 	xIndex = f.gl.Reduce(xIndex)
 	xIndexBits := f.api.ToBinary(xIndex.Limb, 64)[0 : f.friParams.DegreeBits+f.friParams.Config.RateBits]
 	capIndexBits := xIndexBits[len(xIndexBits)-int(f.friParams.Config.CapHeight):]
@@ -511,21 +494,18 @@ func (f *Chip) VerifyFriProof(
 	initialMerkleCaps []variables.FriMerkleCap,
 	friProof *variables.FriProof,
 ) {
-	// TODO:  Check fri config
-	/* if let Some(max_arity_bits) = params.max_arity_bits() {
-		self.check_recursion_config::<C>(max_arity_bits);
-	}
-
-	debug_assert_eq!(
-		params.final_poly_len(),
-		proof.final_poly.len(),
-		"Final polynomial has wrong degree."
-	); */
+	// This adds no constraints; it is a sanity check that the proof shape matches the (constant) friParams.
+	validateFriProofShape(friProof, instance, f.friParams)
 
 	// Check POW
-
 	f.assertLeadingZeros(friChallenges.FriPowResponse, f.friParams.Config)
 
+	// Check that the parameters are coherent. This adds no constraints; it is a sanity check
+	// that the proof shape matches the friParams.
+	if int(f.friParams.Config.NumQueryRounds) != len(friProof.QueryRoundProofs) {
+		panic("Number of query rounds does not match config.")
+	}
+
 	precomputedReducedEvals := f.fromOpeningsAndAlpha(&openings, friChallenges.FriAlpha)
 
 	// Size of the LDE domain.
diff --git a/fri/fri_utils.go b/fri/fri_utils.go
index dccee7e..fe3b5a6 100644
--- a/fri/fri_utils.go
+++ b/fri/fri_utils.go
@@ -1,7 +1,11 @@
 package fri
 
 import (
+	"math"
+
+	"github.com/consensys/gnark-crypto/field/goldilocks"
 	"github.com/succinctlabs/gnark-plonky2-verifier/types"
+	"github.com/succinctlabs/gnark-plonky2-verifier/variables"
 )
 
 type PolynomialInfo struct {
@@ -146,3 +150,79 @@ func friAllPolys(c *types.CommonCircuitData) []PolynomialInfo {
 
 	return returnArr
 }
+
+// assertNoncanonicalIndicesOK adds no constraints; it only sanity-checks friParams.
+// It is a 1-1 port of assert_noncanonical_indices_ok from fri::recursive_verifier in plonky2.
+func assertNoncanonicalIndicesOK(friParams types.FriParams) {
+	modulus := goldilocks.Modulus().Uint64()
+	ambiguousCount := uint64(math.MaxUint64) - modulus + 1
+	ambiguousProb := float64(ambiguousCount) / float64(modulus)
+	if negligible := friParams.Config.Rate() * 1e-5; ambiguousProb >= negligible {
+		panic("A non-negligible portion of field elements are in the range that permits non-canonical encodings. Need to do more analysis or enforce canonical encodings.")
+	}
+}
+
+// validateFriProofShape adds no constraints; it is only a sanity check that the shape of the proof
+// variable matches the given FriParams. 1-1 port of validate_fri_proof_shape (plonky2 fri::validate_shape).
+func validateFriProofShape(proof *variables.FriProof, instance InstanceInfo, params *types.FriParams) {
+	// Extra leaf elements expected when an oracle is blinded and the params are hiding.
+	const saltElems = 4
+
+	capHeight := params.Config.CapHeight
+	ldeBits := params.LdeBits()
+
+	// Every commit-phase Merkle cap must hold exactly 1<<capHeight entries.
+	for _, merkleCap := range proof.CommitPhaseMerkleCaps {
+		if len(merkleCap) != 1<<capHeight {
+			panic("config cap_height does not match commit_phase_merkle_caps")
+		}
+	}
+
+	// Check each query round: the initial-tree openings first, then the reduction steps.
+	for _, round := range proof.QueryRoundProofs {
+		// There must be exactly one evaluation proof per oracle.
+		evalsProofs := round.InitialTreesProof.EvalsProofs
+		if len(evalsProofs) != len(instance.Oracles) {
+			panic("eval proofs length is not equal to instance oracles length")
+		}
+		for i, evalProof := range evalsProofs {
+			// Expected leaf width: the oracle's polynomials, plus salt when blinding applies.
+			oracle := instance.Oracles[i]
+			wantLeafLen := int(oracle.NumPolys)
+			if oracle.Blinding && params.Hiding {
+				wantLeafLen += saltElems
+			}
+			if len(evalProof.Elements) != wantLeafLen {
+				panic("eval proof leaf length doesn't match oracle info")
+			}
+
+			if len(evalProof.MerkleProof.Siblings)+int(capHeight) != ldeBits {
+				panic("length of merkle proof + capHeight doesn't match lde_bits from params")
+			}
+		}
+
+		if len(round.Steps) != len(params.ReductionArityBits) {
+			panic("length of steps != params.reduction_arity_bits")
+		}
+
+		// Each reduction step shortens the codeword by that step's arity bits.
+		codewordLenBits := ldeBits
+		for i, step := range round.Steps {
+			arityBits := params.ReductionArityBits[i]
+			codewordLenBits -= int(arityBits)
+
+			if len(step.Evals) != 1<<arityBits {
+				panic("len evals doesn't match arity")
+			}
+
+			if len(step.MerkleProof.Siblings)+int(capHeight) != codewordLenBits {
+				panic("len merkleProof doesn't match codewordLenBits")
+			}
+		}
+	}
+
+	// The final polynomial must have exactly FinalPolyLen() coefficients.
+	if len(proof.FinalPoly.Coeffs) != params.FinalPolyLen() {
+		panic("len finalPoly doesn't match params FinalPolyLen")
+	}
+}
diff --git a/goldilocks/base.go b/goldilocks/base.go
index 393d19e..8c9e1d6 100644
--- a/goldilocks/base.go
+++ b/goldilocks/base.go
@@ -131,9 +131,8 @@ func (p *Chip) MulAdd(a Variable, b Variable, c Variable) Variable {
 	quotient := NewVariable(result[0])
 	remainder := NewVariable(result[1])
 
-	lhs := p.api.Mul(a.Limb, b.Limb)
-	lhs = p.api.Add(lhs, c.Limb)
-	rhs := p.api.Add(p.api.Mul(quotient.Limb, MODULUS), remainder.Limb)
+	lhs := p.api.MulAcc(p.api.Mul(c.Limb, 1), a.Limb, b.Limb) // copy c.Limb: MulAcc may mutate its first operand in place
+	rhs := p.api.MulAcc(remainder.Limb, MODULUS, quotient.Limb)
 	p.api.AssertIsEqual(lhs, rhs)
 
 	p.RangeCheck(quotient)
@@ -144,7 +143,7 @@ func (p *Chip) MulAdd(a Variable, b Variable, c Variable) Variable {
 // Multiplies two field elements and adds a field element such that x * y + z = c within the
 // Golidlocks field without reducing.
 func (p *Chip) MulAddNoReduce(a Variable, b Variable, c Variable) Variable {
-	return p.AddNoReduce(p.MulNoReduce(a, b), c)
+	return NewVariable(p.api.MulAcc(p.api.Mul(c.Limb, 1), a.Limb, b.Limb)) // copy c.Limb: MulAcc may mutate its first operand in place
 }
 
 // The hint used to compute MulAdd.
diff --git a/plonk/plonk.go b/plonk/plonk.go
index 945611f..00cfc86 100644
--- a/plonk/plonk.go
+++ b/plonk/plonk.go
@@ -25,8 +25,6 @@ type PlonkChip struct {
 }
 
 func NewPlonkChip(api frontend.API, commonData types.CommonCircuitData) *PlonkChip {
-	// TODO:  Should degreeBits be verified that it fits within the field and that degree is within uint64?
-
 	// Create the gates based on commonData GateIds
 	createdGates := []gates.Gate{}
 	for _, gateId := range commonData.GateIds {
diff --git a/poseidon/bn254.go b/poseidon/bn254.go
index 48e4a21..77ac7ce 100644
--- a/poseidon/bn254.go
+++ b/poseidon/bn254.go
@@ -47,6 +47,9 @@ func (c *BN254Chip) HashNoPad(input []gl.Variable) BN254HashOut {
 		frontend.Variable(0),
 	}
 
+	two_to_32 := new(big.Int).SetInt64(1 << 32)
+	two_to_64 := new(big.Int).Mul(two_to_32, two_to_32)
+
 	for i := 0; i < len(input); i += BN254_SPONGE_RATE * 3 {
 		endI := c.min(len(input), i+BN254_SPONGE_RATE*3)
 		rateChunk := input[i:endI]
@@ -54,13 +57,12 @@ func (c *BN254Chip) HashNoPad(input []gl.Variable) BN254HashOut {
 			endJ := c.min(len(rateChunk), j+3)
 			bn254Chunk := rateChunk[j:endJ]
 
-			bits := []frontend.Variable{}
+			inter := frontend.Variable(0)
 			for k := 0; k < len(bn254Chunk); k++ {
-				bn254Chunk[k] = c.gl.Reduce(bn254Chunk[k])
-				bits = append(bits, c.api.ToBinary(bn254Chunk[k].Limb, 64)...)
+				inter = c.api.MulAcc(inter, bn254Chunk[k].Limb, new(big.Int).Exp(two_to_64, big.NewInt(int64(k)), nil))
 			}
 
-			state[stateIdx+1] = c.api.FromBinary(bits...)
+			state[stateIdx+1] = inter
 		}
 
 		state = c.Poseidon(state)
@@ -75,7 +77,7 @@ func (c *BN254Chip) HashOrNoop(input []gl.Variable) BN254HashOut {
 
 		alpha := new(big.Int).SetInt64(1 << 32)
 		for i, inputElement := range input {
-			returnVal = c.api.Add(returnVal, c.api.Mul(inputElement, alpha.Exp(alpha, big.NewInt(int64(i)), nil)))
+			returnVal = c.api.MulAcc(returnVal, inputElement, alpha.Exp(alpha, big.NewInt(int64(i)), nil))
 		}
 
 		return BN254HashOut(returnVal)
@@ -145,16 +147,13 @@ func (c *BN254Chip) partialRounds(state BN254State) BN254State {
 		state[0] = c.exp5(state[0])
 		state[0] = c.api.Add(state[0], cConstants[(BN254_FULL_ROUNDS/2+1)*BN254_SPONGE_WIDTH+i])
 
-		var mul frontend.Variable
 		newState0 := frontend.Variable(0)
 		for j := 0; j < BN254_SPONGE_WIDTH; j++ {
-			mul = c.api.Mul(sConstants[(BN254_SPONGE_WIDTH*2-1)*i+j], state[j])
-			newState0 = c.api.Add(newState0, mul)
+			newState0 = c.api.MulAcc(newState0, sConstants[(BN254_SPONGE_WIDTH*2-1)*i+j], state[j])
 		}
 
 		for k := 1; k < BN254_SPONGE_WIDTH; k++ {
-			mul = c.api.Mul(state[0], sConstants[(BN254_SPONGE_WIDTH*2-1)*i+BN254_SPONGE_WIDTH+k-1])
-			state[k] = c.api.Add(state[k], mul)
+			state[k] = c.api.MulAcc(state[k], state[0], sConstants[(BN254_SPONGE_WIDTH*2-1)*i+BN254_SPONGE_WIDTH+k-1])
 		}
 		state[0] = newState0
 	}
@@ -186,7 +185,6 @@ func (c *BN254Chip) exp5state(state BN254State) BN254State {
 }
 
 func (c *BN254Chip) mix(state_ BN254State, constantMatrix [][]*big.Int) BN254State {
-	var mul frontend.Variable
 	var result BN254State
 
 	for i := 0; i < BN254_SPONGE_WIDTH; i++ {
@@ -195,8 +193,7 @@ func (c *BN254Chip) mix(state_ BN254State, constantMatrix [][]*big.Int) BN254Sta
 
 	for i := 0; i < BN254_SPONGE_WIDTH; i++ {
 		for j := 0; j < BN254_SPONGE_WIDTH; j++ {
-			mul = c.api.Mul(constantMatrix[j][i], state_[j])
-			result[i] = c.api.Add(result[i], mul)
+			result[i] = c.api.MulAcc(result[i], constantMatrix[j][i], state_[j])
 		}
 	}
 
diff --git a/types/types.go b/types/types.go
index c206e1b..03bd38f 100644
--- a/types/types.go
+++ b/types/types.go
@@ -1,13 +1,17 @@
 package types
 
-import "github.com/succinctlabs/gnark-plonky2-verifier/plonk/gates"
+import (
+	"github.com/succinctlabs/gnark-plonky2-verifier/plonk/gates"
+)
 
 type FriConfig struct {
 	RateBits        uint64
 	CapHeight       uint64
 	ProofOfWorkBits uint64
 	NumQueryRounds  uint64
-	// TODO: add FriReductionStrategy
+	// Note: unlike plonky2's FriConfig, this struct has no `reduction_strategy` field (of type
+	// FriReductionStrategy). It is only used for computing `reduction_arity_bits`, which is serialized
+	// in the CommonCircuitData.
 }
 
 func (fc *FriConfig) Rate() float64 {
@@ -21,6 +25,49 @@ type FriParams struct {
 	ReductionArityBits []uint64
 }
 
+// TotalArities returns the sum of all entries in ReductionArityBits.
+func (p *FriParams) TotalArities() int {
+	total := 0
+	for i := range p.ReductionArityBits {
+		total += int(p.ReductionArityBits[i])
+	}
+	return total
+}
+
+// MaxArityBits returns the largest entry in ReductionArityBits, or 0 if there are none.
+func (p *FriParams) MaxArityBits() int {
+	largest := 0
+	for _, bits := range p.ReductionArityBits {
+		if candidate := int(bits); candidate > largest {
+			largest = candidate
+		}
+	}
+	return largest
+}
+
+// LdeBits returns DegreeBits + Config.RateBits as an int.
+func (p *FriParams) LdeBits() int {
+	sum := p.DegreeBits + p.Config.RateBits
+	return int(sum)
+}
+
+// LdeSize returns 1 << LdeBits().
+func (p *FriParams) LdeSize() int {
+	bits := p.LdeBits()
+	return 1 << bits
+}
+
+// FinalPolyBits returns DegreeBits minus TotalArities().
+func (p *FriParams) FinalPolyBits() int {
+	reduced := p.TotalArities()
+	return int(p.DegreeBits) - reduced
+}
+
+// FinalPolyLen returns 1 << FinalPolyBits().
+func (p *FriParams) FinalPolyLen() int {
+	bits := p.FinalPolyBits()
+	return 1 << bits
+}
+
 type CircuitConfig struct {
 	NumWires                uint64
 	NumRoutedWires          uint64