// Source: arnaucube/hermez-node (mirror of https://github.com/arnaucube/hermez-node.git)

package coordinator

import (
	"context"
	"database/sql"
	"fmt"
	"math/big"
	"sync"
	"time"

	"github.com/hermeznetwork/hermez-node/batchbuilder"
	"github.com/hermeznetwork/hermez-node/common"
	"github.com/hermeznetwork/hermez-node/db/historydb"
	"github.com/hermeznetwork/hermez-node/db/l2db"
	"github.com/hermeznetwork/hermez-node/eth"
	"github.com/hermeznetwork/hermez-node/log"
	"github.com/hermeznetwork/hermez-node/prover"
	"github.com/hermeznetwork/hermez-node/synchronizer"
	"github.com/hermeznetwork/hermez-node/txselector"
	"github.com/hermeznetwork/tracerr"
)

// statsVars is the message sent through Pipeline.statsVarsCh: it bundles a
// copy of the synchronizer stats together with the smart contract variables
// update that the forge loop applies via syncSCVars.
type statsVars struct {
	Stats synchronizer.Stats
	Vars  synchronizer.SCVariablesPtr
}

// state holds the forging progress of a Pipeline.  It is (re)initialized in
// Pipeline.reset and updated while batches are forged.
type state struct {
	// batchNum is the number of the last batch forged by the pipeline;
	// the forge loop forges batchNum + 1 next.
	batchNum                     common.BatchNum
	// lastScheduledL1BatchBlockNum is set to the last known ethereum
	// block number + 1 every time an L1 batch is scheduled in forgeBatch
	// (0 after a reset).
	lastScheduledL1BatchBlockNum int64
	// lastForgeL1TxsNum starts from the synchronizer value at reset and is
	// incremented for every L1 batch scheduled in forgeBatch.
	lastForgeL1TxsNum            int64
}

// Pipeline manages the forging of batches with parallel server proofs.  It is
// created with NewPipeline, started with Start (which spawns the forging
// goroutines) and stopped with Stop.
type Pipeline struct {
	// num is the pipeline sequential number, stored in every BatchInfo
	// created by this pipeline
	num    int
	cfg    Config
	consts synchronizer.SCConsts

	// state
	state state
	// started is toggled by Start/Stop.  errAtBatchNum is the batch number
	// at which an error happened (0 means no error) and is guarded by rw.
	started       bool
	rw            sync.RWMutex
	errAtBatchNum common.BatchNum

	// proversPool contains the provers that were ready at creation time,
	// whereas provers keeps all the provers received in NewPipeline (all
	// of them are cancelled in Stop).
	proversPool  *ProversPool
	provers      []prover.Client
	coord        *Coordinator
	txManager    *TxManager
	historyDB    *historydb.HistoryDB
	l2DB         *l2db.L2DB
	txSelector   *txselector.TxSelector
	batchBuilder *batchbuilder.BatchBuilder
	purger       *Purger

	// stats and vars are the local copies of the synchronizer state;
	// updates arrive through statsVarsCh (see SetSyncStatsVars) and are
	// applied by the forge loop.
	stats       synchronizer.Stats
	vars        synchronizer.SCVariables
	statsVarsCh chan statsVars

	// ctx and cancel are created in Start; wg tracks the goroutines
	// spawned by Start so that Stop can wait for them.
	ctx    context.Context
	wg     sync.WaitGroup
	cancel context.CancelFunc
}

// setErrAtBatchNum stores batchNum as the batch at which the pipeline found an
// error.  The write is done while holding the write lock of p.rw.
func (p *Pipeline) setErrAtBatchNum(batchNum common.BatchNum) {
	p.rw.Lock()
	p.errAtBatchNum = batchNum
	p.rw.Unlock()
}

// getErrAtBatchNum returns the batch number at which the pipeline found an
// error (the zero value if none was recorded).  The read is done while holding
// the read lock of p.rw.
func (p *Pipeline) getErrAtBatchNum() common.BatchNum {
	p.rw.RLock()
	errAt := p.errAtBatchNum
	p.rw.RUnlock()
	return errAt
}

// NewPipeline creates a new Pipeline.  Every prover is asked to WaitReady;
// the ones that succeed are added to the provers pool and the ones that fail
// are only logged.  An error is returned when no prover ends up in the pool.
func NewPipeline(ctx context.Context,
	cfg Config,
	num int, // Pipeline sequential number
	historyDB *historydb.HistoryDB,
	l2DB *l2db.L2DB,
	txSelector *txselector.TxSelector,
	batchBuilder *batchbuilder.BatchBuilder,
	purger *Purger,
	coord *Coordinator,
	txManager *TxManager,
	provers []prover.Client,
	scConsts *synchronizer.SCConsts,
) (*Pipeline, error) {
	pool := NewProversPool(len(provers))
	readyCount := 0
	for _, client := range provers {
		err := client.WaitReady(ctx)
		if err != nil {
			log.Errorw("prover.WaitReady", "err", err)
			continue
		}
		pool.Add(ctx, client)
		readyCount++
	}
	if readyCount == 0 {
		return nil, tracerr.Wrap(fmt.Errorf("no provers in the pool"))
	}

	pipeline := &Pipeline{
		num:          num,
		cfg:          cfg,
		consts:       *scConsts,
		proversPool:  pool,
		provers:      provers,
		coord:        coord,
		txManager:    txManager,
		historyDB:    historyDB,
		l2DB:         l2DB,
		txSelector:   txSelector,
		batchBuilder: batchBuilder,
		purger:       purger,
		statsVarsCh:  make(chan statsVars, queueLen),
	}
	return pipeline, nil
}

// SetSyncStatsVars hands a copy of the synchronizer Stats and of the smart
// contract variables update to the pipeline through its channel, so it can be
// called from a goroutine other than the forging one.  The call returns once
// the update has been queued or ctx is done.
func (p *Pipeline) SetSyncStatsVars(ctx context.Context, stats *synchronizer.Stats, vars *synchronizer.SCVariablesPtr) {
	update := statsVars{Stats: *stats, Vars: *vars}
	select {
	case <-ctx.Done():
	case p.statsVarsCh <- update:
	}
}

// reset sets the pipeline state to batchNum and the given synchronizer stats
// and vars, and resets the StateDBs of the TxSelector and the BatchBuilder to
// that batch.
//
// The local checkpoint of each StateDB is used when it exists; otherwise the
// reset is done from the synchronizer.  Afterwards, if the batch is already in
// the historyDB and its state root differs from the local one, both StateDBs
// are forcibly reset from the synchronizer.
//
// Any error from the StateDBs or the historyDB (other than sql.ErrNoRows when
// fetching the batch) is returned wrapped.
func (p *Pipeline) reset(batchNum common.BatchNum,
	stats *synchronizer.Stats, vars *synchronizer.SCVariables) error {
	p.state = state{
		batchNum:                     batchNum,
		lastForgeL1TxsNum:            stats.Sync.LastForgeL1TxsNum,
		lastScheduledL1BatchBlockNum: 0,
	}
	p.stats = *stats
	p.vars = *vars

	// Reset the StateDB in TxSelector and BatchBuilder from the
	// synchronizer only if the checkpoint we reset from either:
	// a. Doesn't exist in the TxSelector/BatchBuilder
	// b. The batch has already been synced by the synchronizer and has a
	//    different MTRoot than the BatchBuilder
	// Otherwise, reset from the local checkpoint.

	// First attempt to reset from local checkpoint if such checkpoint exists
	existsTxSelector, err := p.txSelector.LocalAccountsDB().CheckpointExists(p.state.batchNum)
	if err != nil {
		return tracerr.Wrap(err)
	}
	fromSynchronizerTxSelector := !existsTxSelector
	if err := p.txSelector.Reset(p.state.batchNum, fromSynchronizerTxSelector); err != nil {
		return tracerr.Wrap(err)
	}
	existsBatchBuilder, err := p.batchBuilder.LocalStateDB().CheckpointExists(p.state.batchNum)
	if err != nil {
		return tracerr.Wrap(err)
	}
	fromSynchronizerBatchBuilder := !existsBatchBuilder
	if err := p.batchBuilder.Reset(p.state.batchNum, fromSynchronizerBatchBuilder); err != nil {
		return tracerr.Wrap(err)
	}

	// After reset, check that if the batch exists in the historyDB, the
	// stateRoot matches with the local one, if not, force a reset from
	// synchronizer
	batch, err := p.historyDB.GetBatch(p.state.batchNum)
	if tracerr.Unwrap(err) == sql.ErrNoRows {
		// The batch is not in the historyDB: there is nothing to
		// compare against.
		return nil
	} else if err != nil {
		return tracerr.Wrap(err)
	}
	localStateRoot := p.batchBuilder.LocalStateDB().MT.Root().BigInt()
	if batch.StateRoot.Cmp(localStateRoot) == 0 {
		return nil
	}
	// Debugw takes a plain message followed by key/value pairs (like the
	// other *w log calls in this file), not a printf template.
	log.Debugw("localStateRoot != historyDB stateRoot.  "+
		"Forcing reset from Synchronizer",
		"localStateRoot", localStateRoot,
		"historyDBStateRoot", batch.StateRoot)
	// StateRoot from synchronizer doesn't match StateRoot
	// from batchBuilder, force a reset from synchronizer
	if err := p.txSelector.Reset(p.state.batchNum, true); err != nil {
		return tracerr.Wrap(err)
	}
	if err := p.batchBuilder.Reset(p.state.batchNum, true); err != nil {
		return tracerr.Wrap(err)
	}
	return nil
}

// syncSCVars applies the smart contract variables update in vars to the
// pipeline's local copy (p.vars) by delegating to updateSCVars.  It is called
// from the forge loop for every message received on statsVarsCh.
func (p *Pipeline) syncSCVars(vars synchronizer.SCVariablesPtr) {
	updateSCVars(&p.vars, vars)
}

// handleForgeBatch forges the batch batchNum through p.forgeBatch to obtain
// the zkInputs, then takes a server proof from the pool (blocking until one is
// available) and sends the zkInputs to it so that the proof computation
// starts.
//
// If ctx is done after any of the steps, ctx.Err() is returned.  Other errors
// are logged and returned as they are; when sending the inputs fails, the
// server proof is put back into the pool.
func (p *Pipeline) handleForgeBatch(ctx context.Context, batchNum common.BatchNum) (*BatchInfo, error) {
	batchInfo, err := p.forgeBatch(batchNum)
	if ctx.Err() != nil {
		return nil, ctx.Err()
	}
	if err != nil {
		if tracerr.Unwrap(err) != errLastL1BatchNotSynced {
			log.Errorw("forgeBatch", "err", err)
		} else {
			log.Warnw("forgeBatch: scheduled L1Batch too early", "err", err,
				"lastForgeL1TxsNum", p.state.lastForgeL1TxsNum,
				"syncLastForgeL1TxsNum", p.stats.Sync.LastForgeL1TxsNum)
		}
		return nil, err
	}

	// 6. Wait for an available server proof (blocking call)
	serverProof, err := p.proversPool.Get(ctx)
	if ctx.Err() != nil {
		return nil, ctx.Err()
	}
	if err != nil {
		log.Errorw("proversPool.Get", "err", err)
		return nil, err
	}

	batchInfo.ServerProof = serverProof
	err = p.sendServerProof(ctx, batchInfo)
	if ctx.Err() != nil {
		return nil, ctx.Err()
	}
	if err != nil {
		log.Errorw("sendServerProof", "err", err)
		// The prover didn't get the inputs: release it
		batchInfo.ServerProof = nil
		p.proversPool.Add(ctx, serverProof)
		return nil, err
	}
	return batchInfo, nil
}

// Start the forging pipeline.  The pipeline state is reset to batchNum with
// the given stats and vars, and two goroutines are spawned:
//
//   - The forge loop, which applies the stats/vars updates received through
//     statsVarsCh and forges the next batch via handleForgeBatch.
//   - The proof loop, which waits for the proof of every forged batch, puts
//     the prover back into the pool and hands the batch to the TxManager.
//
// When any of the loops finds an error, the failing batch number is stored
// with setErrAtBatchNum, a MsgStopPipeline is sent to the coordinator and no
// more batches are processed until the pipeline is stopped.
//
// Calling Start on an already started pipeline is fatal.  If the reset fails
// the error is returned wrapped and the pipeline is left not started.
func (p *Pipeline) Start(batchNum common.BatchNum,
	stats *synchronizer.Stats, vars *synchronizer.SCVariables) error {
	if p.started {
		log.Fatal("Pipeline already started")
	}
	// Only mark the pipeline as started once the reset has succeeded, so
	// that a failed Start doesn't leave a "started" pipeline without a
	// cancel function.
	if err := p.reset(batchNum, stats, vars); err != nil {
		return tracerr.Wrap(err)
	}
	p.ctx, p.cancel = context.WithCancel(context.Background())
	p.started = true

	queueSize := 1
	batchChSentServerProof := make(chan *BatchInfo, queueSize)

	p.wg.Add(1)
	go func() {
		defer p.wg.Done()
		// NOTE(review): waitDuration is never set back to zeroDuration
		// after a successful forge; confirm this pacing is intended.
		waitDuration := zeroDuration
		for {
			select {
			case <-p.ctx.Done():
				log.Info("Pipeline forgeBatch loop done")
				return
			case statsVars := <-p.statsVarsCh:
				p.stats = statsVars.Stats
				p.syncSCVars(statsVars.Vars)
			case <-time.After(waitDuration):
				// Once errAtBatchNum != 0, we stop forging
				// batches because there's been an error and we
				// wait for the pipeline to be stopped.
				if p.getErrAtBatchNum() != 0 {
					waitDuration = p.cfg.ForgeRetryInterval
					continue
				}
				nextBatchNum := p.state.batchNum + 1
				batchInfo, err := p.handleForgeBatch(p.ctx, nextBatchNum)
				if p.ctx.Err() != nil {
					continue
				} else if tracerr.Unwrap(err) == errLastL1BatchNotSynced {
					// Not an error: retry the same batch later
					waitDuration = p.cfg.ForgeRetryInterval
					continue
				} else if err != nil {
					p.setErrAtBatchNum(nextBatchNum)
					waitDuration = p.cfg.ForgeRetryInterval
					p.coord.SendMsg(p.ctx, MsgStopPipeline{
						Reason: fmt.Sprintf(
							"Pipeline.handleForgBatch: %v", err),
						FailedBatchNum: nextBatchNum,
					})
					continue
				}

				p.state.batchNum = nextBatchNum
				select {
				case batchChSentServerProof <- batchInfo:
				case <-p.ctx.Done():
				}
			}
		}
	}()

	p.wg.Add(1)
	go func() {
		defer p.wg.Done()
		for {
			select {
			case <-p.ctx.Done():
				log.Info("Pipeline waitServerProofSendEth loop done")
				return
			case batchInfo := <-batchChSentServerProof:
				// Once errAtBatchNum != 0, we stop forging
				// batches because there's been an error and we
				// wait for the pipeline to be stopped.
				if p.getErrAtBatchNum() != 0 {
					continue
				}
				err := p.waitServerProof(p.ctx, batchInfo)
				// Keep a reference to the prover before
				// detaching it from the batchInfo, otherwise a
				// nil client would be added to the pool and the
				// real prover would never be returned.
				serverProof := batchInfo.ServerProof
				batchInfo.ServerProof = nil
				if p.ctx.Err() != nil {
					continue
				} else if err != nil {
					log.Errorw("waitServerProof", "err", err)
					p.setErrAtBatchNum(batchInfo.BatchNum)
					p.coord.SendMsg(p.ctx, MsgStopPipeline{
						Reason: fmt.Sprintf(
							"Pipeline.waitServerProof: %v", err),
						FailedBatchNum: batchInfo.BatchNum,
					})
					continue
				}
				// We are done with this serverProof, add it back to the pool
				p.proversPool.Add(p.ctx, serverProof)
				p.txManager.AddBatch(p.ctx, batchInfo)
			}
		}
	}()
	return nil
}

// Stop the forging pipeline: the pipeline context is cancelled, the goroutines
// spawned by Start are waited for, and finally every prover is asked to cancel
// its work.  Errors from cancelling a prover are only logged, and they are
// ignored once ctx is done.  Calling Stop on a pipeline that is not started is
// fatal.
func (p *Pipeline) Stop(ctx context.Context) {
	if !p.started {
		log.Fatal("Pipeline already stopped")
	}
	p.started = false
	log.Info("Stopping Pipeline...")
	p.cancel()
	p.wg.Wait()
	for i := range p.provers {
		err := p.provers[i].Cancel(ctx)
		if ctx.Err() == nil && err != nil {
			log.Errorw("prover.Cancel", "err", err)
		}
	}
}

// sendServerProof stores the batchInfo for debugging and then sends the
// circuit inputs (batchInfo.ZKInputs) to the server proof assigned to the
// batch (batchInfo.ServerProof).  The error of CalculateProof, if any, is
// returned wrapped.
func (p *Pipeline) sendServerProof(ctx context.Context, batchInfo *BatchInfo) error {
	p.cfg.debugBatchStore(batchInfo)

	// 7. Call the selected idle server proof with BatchBuilder output,
	// save server proof info for batchNum
	err := batchInfo.ServerProof.CalculateProof(ctx, batchInfo.ZKInputs)
	if err == nil {
		return nil
	}
	return tracerr.Wrap(err)
}

// forgeBatch forges the batchNum batch.  The steps are:
//
//   - Invalidate and purge old transactions of the pool (via the purger).
//   - Decide whether the batch is an L1+L2 batch or an L2-only batch (see
//     shouldL1L2Batch) and select the transactions with the TxSelector.
//   - Mark the selected L2 txs as forging in the l2DB, update the info of the
//     discarded ones and invalidate the txs with old nonces.
//   - Build the batch with the BatchBuilder to get the zkInputs.
//
// The returned BatchInfo contains the selected transactions and the zkInputs.
// When an L1 batch is scheduled but the synchronizer hasn't yet synced the
// previous one, errLastL1BatchNotSynced is returned (wrapped).  Any other
// error from the purger, DBs, TxSelector or BatchBuilder is returned wrapped.
//
// As a side effect, when an L1 batch is selected the pipeline state
// (lastScheduledL1BatchBlockNum and lastForgeL1TxsNum) is updated.
func (p *Pipeline) forgeBatch(batchNum common.BatchNum) (batchInfo *BatchInfo, err error) {
	// remove transactions from the pool that have been there for too long
	_, err = p.purger.InvalidateMaybe(p.l2DB, p.txSelector.LocalAccountsDB(),
		p.stats.Sync.LastBlock.Num, int64(batchNum))
	if err != nil {
		return nil, tracerr.Wrap(err)
	}
	_, err = p.purger.PurgeMaybe(p.l2DB, p.stats.Sync.LastBlock.Num, int64(batchNum))
	if err != nil {
		return nil, tracerr.Wrap(err)
	}
	// Structure to accumulate data and metadata of the batch
	batchInfo = &BatchInfo{PipelineNum: p.num, BatchNum: batchNum}
	batchInfo.Debug.StartTimestamp = time.Now()
	batchInfo.Debug.StartBlockNum = p.stats.Eth.LastBlock.Num + 1

	selectionCfg := &txselector.SelectionConfig{
		MaxL1UserTxs:      common.RollupConstMaxL1UserTx,
		TxProcessorConfig: p.cfg.TxProcessorConfig,
	}

	var poolL2Txs []common.PoolL2Tx
	var discardedL2Txs []common.PoolL2Tx
	var l1UserTxsExtra, l1CoordTxs []common.L1Tx
	var auths [][]byte
	var coordIdxs []common.Idx

	// TODO: If there are no txs and we are behind the timeout, skip
	// forging a batch and return a particular error that can be handled
	// in the loop where handleForgeBatch is called to retry after an
	// interval

	// 1. Decide if we forge L2Tx or L1+L2Tx
	if p.shouldL1L2Batch(batchInfo) {
		batchInfo.L1Batch = true
		// The previous L1 batch scheduled by the pipeline must have
		// been synchronized before scheduling a new one
		if p.state.lastForgeL1TxsNum != p.stats.Sync.LastForgeL1TxsNum {
			return nil, tracerr.Wrap(errLastL1BatchNotSynced)
		}
		// 2a: L1+L2 txs
		// NOTE: the := below declares a new err that shadows the named
		// result inside this block; every return here is explicit.
		l1UserTxs, err := p.historyDB.GetUnforgedL1UserTxs(p.state.lastForgeL1TxsNum + 1)
		if err != nil {
			return nil, tracerr.Wrap(err)
		}
		coordIdxs, auths, l1UserTxsExtra, l1CoordTxs, poolL2Txs, discardedL2Txs, err =
			p.txSelector.GetL1L2TxSelection(selectionCfg, l1UserTxs)
		if err != nil {
			return nil, tracerr.Wrap(err)
		}

		// Record that an L1 batch has been scheduled at the next block
		p.state.lastScheduledL1BatchBlockNum = p.stats.Eth.LastBlock.Num + 1
		p.state.lastForgeL1TxsNum++
	} else {
		// 2b: only L2 txs
		coordIdxs, auths, l1CoordTxs, poolL2Txs, discardedL2Txs, err =
			p.txSelector.GetL2TxSelection(selectionCfg)
		if err != nil {
			return nil, tracerr.Wrap(err)
		}
		l1UserTxsExtra = nil
	}

	// 3.  Save metadata from TxSelector output for BatchNum
	batchInfo.L1UserTxsExtra = l1UserTxsExtra
	batchInfo.L1CoordTxs = l1CoordTxs
	batchInfo.L1CoordinatorTxsAuths = auths
	batchInfo.CoordIdxs = coordIdxs
	batchInfo.VerifierIdx = p.cfg.VerifierIdx

	if err := p.l2DB.StartForging(common.TxIDsFromPoolL2Txs(poolL2Txs), batchInfo.BatchNum); err != nil {
		return nil, tracerr.Wrap(err)
	}
	if err := p.l2DB.UpdateTxsInfo(discardedL2Txs); err != nil {
		return nil, tracerr.Wrap(err)
	}

	// Invalidate transactions that become invalid because of
	// the poolL2Txs selected.  Will mark as invalid the txs that have a
	// (fromIdx, nonce) which already appears in the selected txs (includes
	// all the nonces smaller than the current one)
	err = p.l2DB.InvalidateOldNonces(idxsNonceFromPoolL2Txs(poolL2Txs), batchInfo.BatchNum)
	if err != nil {
		return nil, tracerr.Wrap(err)
	}

	// 4. Call BatchBuilder with TxSelector output
	configBatch := &batchbuilder.ConfigBatch{
		TxProcessorConfig: p.cfg.TxProcessorConfig,
	}
	zkInputs, err := p.batchBuilder.BuildBatch(coordIdxs, configBatch, l1UserTxsExtra,
		l1CoordTxs, poolL2Txs)
	if err != nil {
		return nil, tracerr.Wrap(err)
	}
	l2Txs, err := common.PoolL2TxsToL2Txs(poolL2Txs) // NOTE: This is a bit ugly, find a better way
	if err != nil {
		return nil, tracerr.Wrap(err)
	}
	batchInfo.L2Txs = l2Txs

	// 5. Save metadata from BatchBuilder output for BatchNum
	batchInfo.ZKInputs = zkInputs
	batchInfo.Debug.Status = StatusForged
	p.cfg.debugBatchStore(batchInfo)
	log.Infow("Pipeline: batch forged internally", "batch", batchInfo.BatchNum)

	return batchInfo, nil
}

// waitServerProof gets the generated zkProof from the server proof assigned to
// the batch and stores it, together with the public inputs and the arguments
// prepared for the forgeBatch call, in batchInfo.  The error of GetProof, if
// any, is returned wrapped.
func (p *Pipeline) waitServerProof(ctx context.Context, batchInfo *BatchInfo) error {
	// Blocking call: it doesn't return until the proof server has
	// calculated the proof.
	proof, pubInputs, err := batchInfo.ServerProof.GetProof(ctx)
	if err != nil {
		return tracerr.Wrap(err)
	}

	batchInfo.Proof, batchInfo.PublicInputs = proof, pubInputs
	batchInfo.ForgeBatchArgs = prepareForgeBatchArgs(batchInfo)
	batchInfo.Debug.Status = StatusProof
	p.cfg.debugBatchStore(batchInfo)

	log.Infow("Pipeline: batch proof calculated", "batch", batchInfo.BatchNum)
	return nil
}

// shouldL1L2Batch reports whether the next batch must be an L1+L2 batch.  This
// is the case when the number of blocks between the last L1 batch (the biggest
// between the last one scheduled by the pipeline and the last one synchronized)
// and the next block has reached the cfg.L1BatchTimeoutPerc portion of the
// ForgeL1L2BatchTimeout - 1 range.  The values used in the decision are also
// stored in batchInfo.Debug.
func (p *Pipeline) shouldL1L2Batch(batchInfo *BatchInfo) bool {
	scheduledBlockNum := p.state.lastScheduledL1BatchBlockNum
	syncedBlockNum := p.stats.Sync.LastL1BatchBlock

	// Take the most recent L1 batch block, either scheduled or synced
	lastL1BatchBlockNum := scheduledBlockNum
	if syncedBlockNum > scheduledBlockNum {
		lastL1BatchBlockNum = syncedBlockNum
	}

	blocksDelta := p.stats.Eth.LastBlock.Num + 1 - lastL1BatchBlockNum
	deadline := int64(float64(p.vars.Rollup.ForgeL1L2BatchTimeout-1) * p.cfg.L1BatchTimeoutPerc)

	// Set Debug information
	batchInfo.Debug.LastScheduledL1BatchBlockNum = scheduledBlockNum
	batchInfo.Debug.LastL1BatchBlock = syncedBlockNum
	batchInfo.Debug.LastL1BatchBlockDelta = blocksDelta
	batchInfo.Debug.L1BatchBlockScheduleDeadline = deadline

	return blocksDelta >= deadline
}

// prepareForgeBatchArgs builds the arguments of the rollup forgeBatch call
// from the data accumulated in batchInfo: the new state taken from the
// zkInputs metadata, the selected transactions and the proof.
func prepareForgeBatchArgs(batchInfo *BatchInfo) *eth.RollupForgeBatchArgs {
	zki := batchInfo.ZKInputs
	proof := batchInfo.Proof

	args := new(eth.RollupForgeBatchArgs)

	// New state, from the zkInputs metadata
	args.NewLastIdx = int64(zki.Metadata.NewLastIdxRaw)
	args.NewStRoot = zki.Metadata.NewStateRootRaw.BigInt()
	args.NewExitRoot = zki.Metadata.NewExitRootRaw.BigInt()

	// Transactions
	args.L1UserTxs = batchInfo.L1UserTxsExtra
	args.L1CoordinatorTxs = batchInfo.L1CoordTxs
	args.L1CoordinatorTxsAuths = batchInfo.L1CoordinatorTxsAuths
	args.L2TxsData = batchInfo.L2Txs
	args.FeeIdxCoordinator = batchInfo.CoordIdxs

	// Circuit selector
	args.VerifierIdx = batchInfo.VerifierIdx
	args.L1Batch = batchInfo.L1Batch

	// Proof
	args.ProofA = [2]*big.Int{proof.PiA[0], proof.PiA[1]}
	// Implementation of the verifier need a swap on the proofB vector
	args.ProofB = [2][2]*big.Int{
		{proof.PiB[0][1], proof.PiB[0][0]},
		{proof.PiB[1][1], proof.PiB[1][0]},
	}
	args.ProofC = [2]*big.Int{proof.PiC[0], proof.PiC[1]}

	return args
}