arnaucube
/
hermez-node
mirror of https://github.com/arnaucube/hermez-node.git


								package coordinator


								import (

									"context"

									"database/sql"

									"fmt"

									"math/big"

									"sync"

									"time"


									"github.com/hermeznetwork/hermez-node/batchbuilder"

									"github.com/hermeznetwork/hermez-node/common"

									"github.com/hermeznetwork/hermez-node/db/historydb"

									"github.com/hermeznetwork/hermez-node/db/l2db"

									"github.com/hermeznetwork/hermez-node/eth"

									"github.com/hermeznetwork/hermez-node/log"

									"github.com/hermeznetwork/hermez-node/prover"

									"github.com/hermeznetwork/hermez-node/synchronizer"

									"github.com/hermeznetwork/hermez-node/txselector"

									"github.com/hermeznetwork/tracerr"

								)


								type statsVars struct {

									Stats synchronizer.Stats

									Vars  synchronizer.SCVariablesPtr

								}


								type state struct {

									batchNum                     common.BatchNum

									lastScheduledL1BatchBlockNum int64

									lastForgeL1TxsNum            int64

									lastSlotForged               int64

								}


								// Pipeline manages the forging of batches with parallel server proofs

								type Pipeline struct {

									num    int

									cfg    Config

									consts synchronizer.SCConsts


									// state

									state         state

									started       bool

									rw            sync.RWMutex

									errAtBatchNum common.BatchNum

									lastForgeTime time.Time


									proversPool           *ProversPool

									provers               []prover.Client

									coord                 *Coordinator

									txManager             *TxManager

									historyDB             *historydb.HistoryDB

									l2DB                  *l2db.L2DB

									txSelector            *txselector.TxSelector

									batchBuilder          *batchbuilder.BatchBuilder

									mutexL2DBUpdateDelete *sync.Mutex

									purger                *Purger


									stats       synchronizer.Stats

									vars        synchronizer.SCVariables

									statsVarsCh chan statsVars


									ctx    context.Context

									wg     sync.WaitGroup

									cancel context.CancelFunc

								}


								func (p *Pipeline) setErrAtBatchNum(batchNum common.BatchNum) {

									p.rw.Lock()

									defer p.rw.Unlock()

									p.errAtBatchNum = batchNum

								}


								func (p *Pipeline) getErrAtBatchNum() common.BatchNum {

									p.rw.RLock()

									defer p.rw.RUnlock()

									return p.errAtBatchNum

								}


								// NewPipeline creates a new Pipeline

								func NewPipeline(ctx context.Context,

									cfg Config,

									num int, // Pipeline sequential number

									historyDB *historydb.HistoryDB,

									l2DB *l2db.L2DB,

									txSelector *txselector.TxSelector,

									batchBuilder *batchbuilder.BatchBuilder,

									mutexL2DBUpdateDelete *sync.Mutex,

									purger *Purger,

									coord *Coordinator,

									txManager *TxManager,

									provers []prover.Client,

									scConsts *synchronizer.SCConsts,

								) (*Pipeline, error) {

									proversPool := NewProversPool(len(provers))

									proversPoolSize := 0

									for _, prover := range provers {

										if err := prover.WaitReady(ctx); err != nil {

											log.Errorw("prover.WaitReady", "err", err)

										} else {

											proversPool.Add(ctx, prover)

											proversPoolSize++

										}

									}

									if proversPoolSize == 0 {

										return nil, tracerr.Wrap(fmt.Errorf("no provers in the pool"))

									}

									return &Pipeline{

										num:                   num,

										cfg:                   cfg,

										historyDB:             historyDB,

										l2DB:                  l2DB,

										txSelector:            txSelector,

										batchBuilder:          batchBuilder,

										provers:               provers,

										proversPool:           proversPool,

										mutexL2DBUpdateDelete: mutexL2DBUpdateDelete,

										purger:                purger,

										coord:                 coord,

										txManager:             txManager,

										consts:                *scConsts,

										statsVarsCh:           make(chan statsVars, queueLen),

									}, nil

								}


								// SetSyncStatsVars is a thread safe method to sets the synchronizer Stats

								func (p *Pipeline) SetSyncStatsVars(ctx context.Context, stats *synchronizer.Stats, vars *synchronizer.SCVariablesPtr) {

									select {

									case p.statsVarsCh <- statsVars{Stats: *stats, Vars: *vars}:

									case <-ctx.Done():

									}

								}


								// reset pipeline state

								func (p *Pipeline) reset(batchNum common.BatchNum,

									stats *synchronizer.Stats, vars *synchronizer.SCVariables) error {

									p.state = state{

										batchNum:                     batchNum,

										lastForgeL1TxsNum:            stats.Sync.LastForgeL1TxsNum,

										lastScheduledL1BatchBlockNum: 0,

										lastSlotForged:               -1,

									}

									p.stats = *stats

									p.vars = *vars


									// Reset the StateDB in TxSelector and BatchBuilder from the

									// synchronizer only if the checkpoint we reset from either:

									// a. Doesn't exist in the TxSelector/BatchBuilder

									// b. The batch has already been synced by the synchronizer and has a

									//    different MTRoot than the BatchBuilder

									// Otherwise, reset from the local checkpoint.


									// First attempt to reset from local checkpoint if such checkpoint exists

									existsTxSelector, err := p.txSelector.LocalAccountsDB().CheckpointExists(p.state.batchNum)

									if err != nil {

										return tracerr.Wrap(err)

									}

									fromSynchronizerTxSelector := !existsTxSelector

									if err := p.txSelector.Reset(p.state.batchNum, fromSynchronizerTxSelector); err != nil {

										return tracerr.Wrap(err)

									}

									existsBatchBuilder, err := p.batchBuilder.LocalStateDB().CheckpointExists(p.state.batchNum)

									if err != nil {

										return tracerr.Wrap(err)

									}

									fromSynchronizerBatchBuilder := !existsBatchBuilder

									if err := p.batchBuilder.Reset(p.state.batchNum, fromSynchronizerBatchBuilder); err != nil {

										return tracerr.Wrap(err)

									}


									// After reset, check that if the batch exists in the historyDB, the

									// stateRoot matches with the local one, if not, force a reset from

									// synchronizer

									batch, err := p.historyDB.GetBatch(p.state.batchNum)

									if tracerr.Unwrap(err) == sql.ErrNoRows {

										// nothing to do

									} else if err != nil {

										return tracerr.Wrap(err)

									} else {

										localStateRoot := p.batchBuilder.LocalStateDB().MT.Root().BigInt()

										if batch.StateRoot.Cmp(localStateRoot) != 0 {

											log.Debugw("localStateRoot (%v) != historyDB stateRoot (%v).  "+

												"Forcing reset from Synchronizer", localStateRoot, batch.StateRoot)

											// StateRoot from synchronizer doesn't match StateRoot

											// from batchBuilder, force a reset from synchronizer

											if err := p.txSelector.Reset(p.state.batchNum, true); err != nil {

												return tracerr.Wrap(err)

											}

											if err := p.batchBuilder.Reset(p.state.batchNum, true); err != nil {

												return tracerr.Wrap(err)

											}

										}

									}

									return nil

								}


								func (p *Pipeline) syncSCVars(vars synchronizer.SCVariablesPtr) {

									updateSCVars(&p.vars, vars)

								}


								// handleForgeBatch waits for an available proof server, calls p.forgeBatch to

								// forge the batch and get the zkInputs, and then  sends the zkInputs to the

								// selected proof server so that the proof computation begins.

								func (p *Pipeline) handleForgeBatch(ctx context.Context,

									batchNum common.BatchNum) (batchInfo *BatchInfo, err error) {

									// 1. Wait for an available serverProof (blocking call)

									serverProof, err := p.proversPool.Get(ctx)

									if ctx.Err() != nil {

										return nil, ctx.Err()

									} else if err != nil {

										log.Errorw("proversPool.Get", "err", err)

										return nil, err

									}

									defer func() {

										// If we encounter any error (notice that this function returns

										// errors to notify that a batch is not forged not only because

										// of unexpected errors but also due to benign causes), add the

										// serverProof back to the pool

										if err != nil {

											p.proversPool.Add(ctx, serverProof)

										}

									}()


									// 2. Forge the batch internally (make a selection of txs and prepare

									// all the smart contract arguments)

									p.mutexL2DBUpdateDelete.Lock()

									batchInfo, err = p.forgeBatch(batchNum)

									p.mutexL2DBUpdateDelete.Unlock()

									if ctx.Err() != nil {

										return nil, ctx.Err()

									} else if err != nil {

										if tracerr.Unwrap(err) == errLastL1BatchNotSynced {

											log.Warnw("forgeBatch: scheduled L1Batch too early", "err", err,

												"lastForgeL1TxsNum", p.state.lastForgeL1TxsNum,

												"syncLastForgeL1TxsNum", p.stats.Sync.LastForgeL1TxsNum)

										} else if tracerr.Unwrap(err) == errForgeNoTxsBeforeDelay ||

											tracerr.Unwrap(err) == errForgeBeforeDelay {

											// no log

										} else {

											log.Errorw("forgeBatch", "err", err)

										}

										return nil, err

									}


									// 3. Send the ZKInputs to the proof server

									batchInfo.ServerProof = serverProof

									if err := p.sendServerProof(ctx, batchInfo); ctx.Err() != nil {

										return nil, ctx.Err()

									} else if err != nil {

										log.Errorw("sendServerProof", "err", err)

										return nil, err

									}

									return batchInfo, nil

								}


								// Start the forging pipeline

								func (p *Pipeline) Start(batchNum common.BatchNum,

									stats *synchronizer.Stats, vars *synchronizer.SCVariables) error {

									if p.started {

										log.Fatal("Pipeline already started")

									}

									p.started = true


									if err := p.reset(batchNum, stats, vars); err != nil {

										return tracerr.Wrap(err)

									}

									p.ctx, p.cancel = context.WithCancel(context.Background())


									queueSize := 1

									batchChSentServerProof := make(chan *BatchInfo, queueSize)


									p.wg.Add(1)

									go func() {

										timer := time.NewTimer(zeroDuration)

										for {

											select {

											case <-p.ctx.Done():

												log.Info("Pipeline forgeBatch loop done")

												p.wg.Done()

												return

											case statsVars := <-p.statsVarsCh:

												p.stats = statsVars.Stats

												p.syncSCVars(statsVars.Vars)

											case <-timer.C:

												timer.Reset(p.cfg.ForgeRetryInterval)

												// Once errAtBatchNum != 0, we stop forging

												// batches because there's been an error and we

												// wait for the pipeline to be stopped.

												if p.getErrAtBatchNum() != 0 {

													continue

												}

												batchNum = p.state.batchNum + 1

												batchInfo, err := p.handleForgeBatch(p.ctx, batchNum)

												if p.ctx.Err() != nil {

													continue

												} else if tracerr.Unwrap(err) == errLastL1BatchNotSynced ||

													tracerr.Unwrap(err) == errForgeNoTxsBeforeDelay ||

													tracerr.Unwrap(err) == errForgeBeforeDelay {

													continue

												} else if err != nil {

													p.setErrAtBatchNum(batchNum)

													p.coord.SendMsg(p.ctx, MsgStopPipeline{

														Reason: fmt.Sprintf(

															"Pipeline.handleForgBatch: %v", err),

														FailedBatchNum: batchNum,

													})

													continue

												}

												p.lastForgeTime = time.Now()


												p.state.batchNum = batchNum

												select {

												case batchChSentServerProof <- batchInfo:

												case <-p.ctx.Done():

												}

												if !timer.Stop() {

													<-timer.C

												}

												timer.Reset(zeroDuration)

											}

										}

									}()


									p.wg.Add(1)

									go func() {

										for {

											select {

											case <-p.ctx.Done():

												log.Info("Pipeline waitServerProofSendEth loop done")

												p.wg.Done()

												return

											case batchInfo := <-batchChSentServerProof:

												// Once errAtBatchNum != 0, we stop forging

												// batches because there's been an error and we

												// wait for the pipeline to be stopped.

												if p.getErrAtBatchNum() != 0 {

													continue

												}

												err := p.waitServerProof(p.ctx, batchInfo)

												if p.ctx.Err() != nil {

													continue

												} else if err != nil {

													log.Errorw("waitServerProof", "err", err)

													p.setErrAtBatchNum(batchInfo.BatchNum)

													p.coord.SendMsg(p.ctx, MsgStopPipeline{

														Reason: fmt.Sprintf(

															"Pipeline.waitServerProof: %v", err),

														FailedBatchNum: batchInfo.BatchNum,

													})

													continue

												}

												// We are done with this serverProof, add it back to the pool

												p.proversPool.Add(p.ctx, batchInfo.ServerProof)

												p.txManager.AddBatch(p.ctx, batchInfo)

											}

										}

									}()

									return nil

								}


								// Stop the forging pipeline

								func (p *Pipeline) Stop(ctx context.Context) {

									if !p.started {

										log.Fatal("Pipeline already stopped")

									}

									p.started = false

									log.Info("Stopping Pipeline...")

									p.cancel()

									p.wg.Wait()

									for _, prover := range p.provers {

										if err := prover.Cancel(ctx); ctx.Err() != nil {

											continue

										} else if err != nil {

											log.Errorw("prover.Cancel", "err", err)

										}

									}

								}


								// sendServerProof sends the circuit inputs to the proof server

								func (p *Pipeline) sendServerProof(ctx context.Context, batchInfo *BatchInfo) error {

									p.cfg.debugBatchStore(batchInfo)


									// 7. Call the selected idle server proof with BatchBuilder output,

									// save server proof info for batchNum

									if err := batchInfo.ServerProof.CalculateProof(ctx, batchInfo.ZKInputs); err != nil {

										return tracerr.Wrap(err)

									}

									return nil

								}


								// forgeBatch forges the batchNum batch.

								func (p *Pipeline) forgeBatch(batchNum common.BatchNum) (batchInfo *BatchInfo, err error) {

									// remove transactions from the pool that have been there for too long

									_, err = p.purger.InvalidateMaybe(p.l2DB, p.txSelector.LocalAccountsDB(),

										p.stats.Sync.LastBlock.Num, int64(batchNum))

									if err != nil {

										return nil, tracerr.Wrap(err)

									}

									_, err = p.purger.PurgeMaybe(p.l2DB, p.stats.Sync.LastBlock.Num, int64(batchNum))

									if err != nil {

										return nil, tracerr.Wrap(err)

									}

									// Structure to accumulate data and metadata of the batch

									now := time.Now()

									batchInfo = &BatchInfo{PipelineNum: p.num, BatchNum: batchNum}

									batchInfo.Debug.StartTimestamp = now

									batchInfo.Debug.StartBlockNum = p.stats.Eth.LastBlock.Num + 1


									selectionCfg := &txselector.SelectionConfig{

										MaxL1UserTxs:      common.RollupConstMaxL1UserTx,

										TxProcessorConfig: p.cfg.TxProcessorConfig,

									}


									var poolL2Txs []common.PoolL2Tx

									var discardedL2Txs []common.PoolL2Tx

									var l1UserTxsExtra, l1CoordTxs []common.L1Tx

									var auths [][]byte

									var coordIdxs []common.Idx


									// Check if the slot is not yet fulfilled

									slotCommitted := false

									if p.stats.Sync.Auction.CurrentSlot.ForgerCommitment ||

										p.stats.Sync.Auction.CurrentSlot.SlotNum == p.state.lastSlotForged {

										slotCommitted = true

									}


									// If we haven't reached the ForgeDelay, skip forging the batch

									if slotCommitted && now.Sub(p.lastForgeTime) < p.cfg.ForgeDelay {

										return nil, errForgeBeforeDelay

									}


									// 1. Decide if we forge L2Tx or L1+L2Tx

									if p.shouldL1L2Batch(batchInfo) {

										batchInfo.L1Batch = true

										if p.state.lastForgeL1TxsNum != p.stats.Sync.LastForgeL1TxsNum {

											return nil, tracerr.Wrap(errLastL1BatchNotSynced)

										}

										// 2a: L1+L2 txs

										l1UserTxs, err := p.historyDB.GetUnforgedL1UserTxs(p.state.lastForgeL1TxsNum + 1)

										if err != nil {

											return nil, tracerr.Wrap(err)

										}

										coordIdxs, auths, l1UserTxsExtra, l1CoordTxs, poolL2Txs, discardedL2Txs, err =

											p.txSelector.GetL1L2TxSelection(selectionCfg, l1UserTxs)

										if err != nil {

											return nil, tracerr.Wrap(err)

										}

									} else {

										// 2b: only L2 txs

										coordIdxs, auths, l1CoordTxs, poolL2Txs, discardedL2Txs, err =

											p.txSelector.GetL2TxSelection(selectionCfg)

										if err != nil {

											return nil, tracerr.Wrap(err)

										}

										l1UserTxsExtra = nil

									}


									// If there are no txs to forge, no l1UserTxs in the open queue to

									// freeze, and we haven't reached the ForgeNoTxsDelay, skip forging the

									// batch.

									if slotCommitted && now.Sub(p.lastForgeTime) < p.cfg.ForgeNoTxsDelay {

										noTxs := false

										if len(l1UserTxsExtra) == 0 && len(l1CoordTxs) == 0 && len(poolL2Txs) == 0 {

											if batchInfo.L1Batch {

												// Query the number of unforged L1UserTxs

												// (either in a open queue or in a frozen

												// not-yet-forged queue).

												count, err := p.historyDB.GetUnforgedL1UserTxsCount()

												if err != nil {

													return nil, tracerr.Wrap(err)

												}

												// If there are future L1UserTxs, we forge a

												// batch to advance the queues to be able to

												// forge the L1UserTxs in the future.

												// Otherwise, skip.

												if count == 0 {

													noTxs = true

												}

											} else {

												noTxs = true

											}

										}

										if noTxs {

											if err := p.txSelector.Reset(batchInfo.BatchNum-1, false); err != nil {

												return nil, tracerr.Wrap(err)

											}

											return nil, errForgeNoTxsBeforeDelay

										}

									}


									if batchInfo.L1Batch {

										p.state.lastScheduledL1BatchBlockNum = p.stats.Eth.LastBlock.Num + 1

										p.state.lastForgeL1TxsNum++

									}


									// 3.  Save metadata from TxSelector output for BatchNum

									batchInfo.L1UserTxsExtra = l1UserTxsExtra

									batchInfo.L1CoordTxs = l1CoordTxs

									batchInfo.L1CoordinatorTxsAuths = auths

									batchInfo.CoordIdxs = coordIdxs

									batchInfo.VerifierIdx = p.cfg.VerifierIdx


									if err := p.l2DB.StartForging(common.TxIDsFromPoolL2Txs(poolL2Txs), batchInfo.BatchNum); err != nil {

										return nil, tracerr.Wrap(err)

									}

									if err := p.l2DB.UpdateTxsInfo(discardedL2Txs); err != nil {

										return nil, tracerr.Wrap(err)

									}


									// Invalidate transactions that become invalid beause of

									// the poolL2Txs selected.  Will mark as invalid the txs that have a

									// (fromIdx, nonce) which already appears in the selected txs (includes

									// all the nonces smaller than the current one)

									err = p.l2DB.InvalidateOldNonces(idxsNonceFromPoolL2Txs(poolL2Txs), batchInfo.BatchNum)

									if err != nil {

										return nil, tracerr.Wrap(err)

									}


									// 4. Call BatchBuilder with TxSelector output

									configBatch := &batchbuilder.ConfigBatch{

										TxProcessorConfig: p.cfg.TxProcessorConfig,

									}

									zkInputs, err := p.batchBuilder.BuildBatch(coordIdxs, configBatch, l1UserTxsExtra,

										l1CoordTxs, poolL2Txs)

									if err != nil {

										return nil, tracerr.Wrap(err)

									}

									l2Txs, err := common.PoolL2TxsToL2Txs(poolL2Txs) // NOTE: This is a big uggly, find a better way

									if err != nil {

										return nil, tracerr.Wrap(err)

									}

									batchInfo.L2Txs = l2Txs


									// 5. Save metadata from BatchBuilder output for BatchNum

									batchInfo.ZKInputs = zkInputs

									batchInfo.Debug.Status = StatusForged

									p.cfg.debugBatchStore(batchInfo)

									log.Infow("Pipeline: batch forged internally", "batch", batchInfo.BatchNum)


									p.state.lastSlotForged = p.stats.Sync.Auction.CurrentSlot.SlotNum


									return batchInfo, nil

								}


								// waitServerProof gets the generated zkProof & sends it to the SmartContract

								func (p *Pipeline) waitServerProof(ctx context.Context, batchInfo *BatchInfo) error {

									proof, pubInputs, err := batchInfo.ServerProof.GetProof(ctx) // blocking call, until not resolved don't continue. Returns when the proof server has calculated the proof

									if err != nil {

										return tracerr.Wrap(err)

									}

									batchInfo.Proof = proof

									batchInfo.PublicInputs = pubInputs

									batchInfo.ForgeBatchArgs = prepareForgeBatchArgs(batchInfo)

									batchInfo.Debug.Status = StatusProof

									p.cfg.debugBatchStore(batchInfo)

									log.Infow("Pipeline: batch proof calculated", "batch", batchInfo.BatchNum)

									return nil

								}


								func (p *Pipeline) shouldL1L2Batch(batchInfo *BatchInfo) bool {

									// Take the lastL1BatchBlockNum as the biggest between the last

									// scheduled one, and the synchronized one.

									lastL1BatchBlockNum := p.state.lastScheduledL1BatchBlockNum

									if p.stats.Sync.LastL1BatchBlock > lastL1BatchBlockNum {

										lastL1BatchBlockNum = p.stats.Sync.LastL1BatchBlock

									}

									// Set Debug information

									batchInfo.Debug.LastScheduledL1BatchBlockNum = p.state.lastScheduledL1BatchBlockNum

									batchInfo.Debug.LastL1BatchBlock = p.stats.Sync.LastL1BatchBlock

									batchInfo.Debug.LastL1BatchBlockDelta = p.stats.Eth.LastBlock.Num + 1 - lastL1BatchBlockNum

									batchInfo.Debug.L1BatchBlockScheduleDeadline =

										int64(float64(p.vars.Rollup.ForgeL1L2BatchTimeout-1) * p.cfg.L1BatchTimeoutPerc)

									// Return true if we have passed the l1BatchTimeoutPerc portion of the

									// range before the l1batch timeout.

									return p.stats.Eth.LastBlock.Num+1-lastL1BatchBlockNum >=

										int64(float64(p.vars.Rollup.ForgeL1L2BatchTimeout-1)*p.cfg.L1BatchTimeoutPerc)

								}


								func prepareForgeBatchArgs(batchInfo *BatchInfo) *eth.RollupForgeBatchArgs {

									proof := batchInfo.Proof

									zki := batchInfo.ZKInputs

									return &eth.RollupForgeBatchArgs{

										NewLastIdx:            int64(zki.Metadata.NewLastIdxRaw),

										NewStRoot:             zki.Metadata.NewStateRootRaw.BigInt(),

										NewExitRoot:           zki.Metadata.NewExitRootRaw.BigInt(),

										L1UserTxs:             batchInfo.L1UserTxsExtra,

										L1CoordinatorTxs:      batchInfo.L1CoordTxs,

										L1CoordinatorTxsAuths: batchInfo.L1CoordinatorTxsAuths,

										L2TxsData:             batchInfo.L2Txs,

										FeeIdxCoordinator:     batchInfo.CoordIdxs,

										// Circuit selector

										VerifierIdx: batchInfo.VerifierIdx,

										L1Batch:     batchInfo.L1Batch,

										ProofA:      [2]*big.Int{proof.PiA[0], proof.PiA[1]},

										// Implementation of the verifier need a swap on the proofB vector

										ProofB: [2][2]*big.Int{

											{proof.PiB[0][1], proof.PiB[0][0]},

											{proof.PiB[1][1], proof.PiB[1][0]},

										},

										ProofC: [2]*big.Int{proof.PiC[0], proof.PiC[1]},

									}

								}