arnaucube
/
hermez-node
mirror of https://github.com/arnaucube/hermez-node.git


								package coordinator


								import (

									"context"

									"fmt"

									"math/big"

									"os"

									"sync"

									"time"


									ethCommon "github.com/ethereum/go-ethereum/common"

									"github.com/hermeznetwork/hermez-node/batchbuilder"

									"github.com/hermeznetwork/hermez-node/common"

									"github.com/hermeznetwork/hermez-node/config"

									"github.com/hermeznetwork/hermez-node/db/historydb"

									"github.com/hermeznetwork/hermez-node/db/l2db"

									"github.com/hermeznetwork/hermez-node/eth"

									"github.com/hermeznetwork/hermez-node/log"

									"github.com/hermeznetwork/hermez-node/prover"

									"github.com/hermeznetwork/hermez-node/synchronizer"

									"github.com/hermeznetwork/hermez-node/txprocessor"

									"github.com/hermeznetwork/hermez-node/txselector"

									"github.com/hermeznetwork/tracerr"

								)


								var (

									errLastL1BatchNotSynced  = fmt.Errorf("last L1Batch not synced yet")

									errForgeNoTxsBeforeDelay = fmt.Errorf("no txs to forge and we haven't reached the forge no txs delay")

									errForgeBeforeDelay      = fmt.Errorf("we haven't reached the forge delay")

								)


								const (

									queueLen         = 16

									longWaitDuration = 999 * time.Hour

									zeroDuration     = 0 * time.Second

								)


								// Config contains the Coordinator configuration

								type Config struct {

									// ForgerAddress is the address under which this coordinator is forging

									ForgerAddress ethCommon.Address

									// ConfirmBlocks is the number of confirmation blocks to wait for sent

									// ethereum transactions before forgetting about them

									ConfirmBlocks int64

									// L1BatchTimeoutPerc is the portion of the range before the L1Batch

									// timeout that will trigger a schedule to forge an L1Batch

									L1BatchTimeoutPerc float64

									// StartSlotBlocksDelay is the number of blocks of delay to wait before

									// starting the pipeline when we reach a slot in which we can forge.

									StartSlotBlocksDelay int64

									// ScheduleBatchBlocksAheadCheck is the number of blocks ahead in which

									// the forger address is checked to be allowed to forge (apart from

									// checking the next block), used to decide when to stop scheduling new

									// batches (by stopping the pipeline).

									// For example, if we are at block 10 and ScheduleBatchBlocksAheadCheck

									// is 5, eventhough at block 11 we canForge, the pipeline will be

									// stopped if we can't forge at block 15.

									// This value should be the expected number of blocks it takes between

									// scheduling a batch and having it mined.

									ScheduleBatchBlocksAheadCheck int64

									// SendBatchBlocksMarginCheck is the number of margin blocks ahead in

									// which the coordinator is also checked to be allowed to forge, apart

									// from the next block; used to decide when to stop sending batches to

									// the smart contract.

									// For example, if we are at block 10 and SendBatchBlocksMarginCheck is

									// 5, eventhough at block 11 we canForge, the batch will be discarded

									// if we can't forge at block 15.

									// This value should be the expected number of blocks it takes between

									// sending a batch and having it mined.

									SendBatchBlocksMarginCheck int64

									// EthClientAttempts is the number of attempts to do an eth client RPC

									// call before giving up

									EthClientAttempts int

									// ForgeRetryInterval is the waiting interval between calls forge a

									// batch after an error

									ForgeRetryInterval time.Duration

									// ForgeDelay is the delay after which a batch is forged if the slot is

									// already committed.  If set to 0s, the coordinator will continuously

									// forge at the maximum rate.

									ForgeDelay time.Duration

									// ForgeNoTxsDelay is the delay after which a batch is forged even if

									// there are no txs to forge if the slot is already committed.  If set

									// to 0s, the coordinator will continuously forge even if the batches

									// are empty.

									ForgeNoTxsDelay time.Duration

									// SyncRetryInterval is the waiting interval between calls to the main

									// handler of a synced block after an error

									SyncRetryInterval time.Duration

									// PurgeByExtDelInterval is the waiting interval between calls

									// to the PurgeByExternalDelete function of the l2db which deletes

									// pending txs externally marked by the column `external_delete`

									PurgeByExtDelInterval time.Duration

									// EthClientAttemptsDelay is delay between attempts do do an eth client

									// RPC call

									EthClientAttemptsDelay time.Duration

									// EthTxResendTimeout is the timeout after which a non-mined ethereum

									// transaction will be resent (reusing the nonce) with a newly

									// calculated gas price

									EthTxResendTimeout time.Duration

									// EthNoReuseNonce disables reusing nonces of pending transactions for

									// new replacement transactions

									EthNoReuseNonce bool

									// MaxGasPrice is the maximum gas price allowed for ethereum

									// transactions

									MaxGasPrice *big.Int

									// GasPriceIncPerc is the percentage increase of gas price set in an

									// ethereum transaction from the suggested gas price by the ehtereum

									// node

									GasPriceIncPerc int64

									// TxManagerCheckInterval is the waiting interval between receipt

									// checks of ethereum transactions in the TxManager

									TxManagerCheckInterval time.Duration

									// DebugBatchPath if set, specifies the path where batchInfo is stored

									// in JSON in every step/update of the pipeline

									DebugBatchPath string

									Purger         PurgerCfg

									// VerifierIdx is the index of the verifier contract registered in the

									// smart contract

									VerifierIdx uint8

									// ForgeBatchGasCost contains the cost of each action in the

									// ForgeBatch transaction.

									ForgeBatchGasCost config.ForgeBatchGasCost

									TxProcessorConfig txprocessor.Config

								}


								func (c *Config) debugBatchStore(batchInfo *BatchInfo) {

									if c.DebugBatchPath != "" {

										if err := batchInfo.DebugStore(c.DebugBatchPath); err != nil {

											log.Warnw("Error storing debug BatchInfo",

												"path", c.DebugBatchPath, "err", err)

										}

									}

								}


								type fromBatch struct {

									BatchNum   common.BatchNum

									ForgerAddr ethCommon.Address

									StateRoot  *big.Int

								}


								// Coordinator implements the Coordinator type

								type Coordinator struct {

									// State

									pipelineNum       int       // Pipeline sequential number.  The first pipeline is 1

									pipelineFromBatch fromBatch // batch from which we started the pipeline

									provers           []prover.Client

									consts            synchronizer.SCConsts

									vars              synchronizer.SCVariables

									stats             synchronizer.Stats

									started           bool


									cfg Config


									historyDB    *historydb.HistoryDB

									l2DB         *l2db.L2DB

									txSelector   *txselector.TxSelector

									batchBuilder *batchbuilder.BatchBuilder


									msgCh  chan interface{}

									ctx    context.Context

									wg     sync.WaitGroup

									cancel context.CancelFunc


									// mutexL2DBUpdateDelete protects updates to the L2DB so that

									// these two processes always happen exclusively:

									// - Pipeline taking pending txs, running through the TxProcessor and

									//   marking selected txs as forging

									// - Coordinator deleting pending txs that have been marked with

									//   `external_delete`.

									// Without this mutex, the coordinator could delete a pending txs that

									// has just been selected by the TxProcessor in the pipeline.

									mutexL2DBUpdateDelete sync.Mutex

									pipeline              *Pipeline

									lastNonFailedBatchNum common.BatchNum


									purger    *Purger

									txManager *TxManager

								}


								// NewCoordinator creates a new Coordinator

								func NewCoordinator(cfg Config,

									historyDB *historydb.HistoryDB,

									l2DB *l2db.L2DB,

									txSelector *txselector.TxSelector,

									batchBuilder *batchbuilder.BatchBuilder,

									serverProofs []prover.Client,

									ethClient eth.ClientInterface,

									scConsts *common.SCConsts,

									initSCVars *common.SCVariables,

								) (*Coordinator, error) {

									// nolint reason: hardcoded `1.0`, by design the percentage can't be over 100%

									if cfg.L1BatchTimeoutPerc >= 1.0 { //nolint:gomnd

										return nil, tracerr.Wrap(fmt.Errorf("invalid value for Config.L1BatchTimeoutPerc (%v >= 1.0)",

											cfg.L1BatchTimeoutPerc))

									}

									if cfg.EthClientAttempts < 1 {

										return nil, tracerr.Wrap(fmt.Errorf("invalid value for Config.EthClientAttempts (%v < 1)",

											cfg.EthClientAttempts))

									}


									if cfg.DebugBatchPath != "" {

										if err := os.MkdirAll(cfg.DebugBatchPath, 0744); err != nil {

											return nil, tracerr.Wrap(err)

										}

									}


									purger := Purger{

										cfg:                 cfg.Purger,

										lastPurgeBlock:      0,

										lastPurgeBatch:      0,

										lastInvalidateBlock: 0,

										lastInvalidateBatch: 0,

									}


									ctx, cancel := context.WithCancel(context.Background())

									c := Coordinator{

										pipelineNum: 0,

										pipelineFromBatch: fromBatch{

											BatchNum:   0,

											ForgerAddr: ethCommon.Address{},

											StateRoot:  big.NewInt(0),

										},

										provers: serverProofs,

										consts:  *scConsts,

										vars:    *initSCVars,


										cfg: cfg,


										historyDB:    historyDB,

										l2DB:         l2DB,

										txSelector:   txSelector,

										batchBuilder: batchBuilder,


										purger: &purger,


										msgCh: make(chan interface{}),

										ctx:   ctx,

										// wg

										cancel: cancel,

									}

									ctxTimeout, ctxTimeoutCancel := context.WithTimeout(ctx, 1*time.Second)

									defer ctxTimeoutCancel()

									txManager, err := NewTxManager(ctxTimeout, &cfg, ethClient, l2DB, &c,

										scConsts, initSCVars)

									if err != nil {

										return nil, tracerr.Wrap(err)

									}

									c.txManager = txManager

									// Set Eth LastBlockNum to -1 in stats so that stats.Synced() is

									// guaranteed to return false before it's updated with a real stats

									c.stats.Eth.LastBlock.Num = -1

									return &c, nil

								}


								// TxSelector returns the inner TxSelector

								func (c *Coordinator) TxSelector() *txselector.TxSelector {

									return c.txSelector

								}


								// BatchBuilder returns the inner BatchBuilder

								func (c *Coordinator) BatchBuilder() *batchbuilder.BatchBuilder {

									return c.batchBuilder

								}


								func (c *Coordinator) newPipeline(ctx context.Context) (*Pipeline, error) {

									c.pipelineNum++

									return NewPipeline(ctx, c.cfg, c.pipelineNum, c.historyDB, c.l2DB, c.txSelector,

										c.batchBuilder, &c.mutexL2DBUpdateDelete, c.purger, c, c.txManager,

										c.provers, &c.consts)

								}


								// MsgSyncBlock indicates an update to the Synchronizer stats

								type MsgSyncBlock struct {

									Stats   synchronizer.Stats

									Batches []common.BatchData

									// Vars contains each Smart Contract variables if they are updated, or

									// nil if they haven't changed.

									Vars synchronizer.SCVariablesPtr

								}


								// MsgSyncReorg indicates a reorg

								type MsgSyncReorg struct {

									Stats synchronizer.Stats

									Vars  synchronizer.SCVariablesPtr

								}


								// MsgStopPipeline indicates a signal to reset the pipeline

								type MsgStopPipeline struct {

									Reason string

									// FailedBatchNum indicates the first batchNum that failed in the

									// pipeline.  If FailedBatchNum is 0, it should be ignored.

									FailedBatchNum common.BatchNum

								}


								// SendMsg is a thread safe method to pass a message to the Coordinator

								func (c *Coordinator) SendMsg(ctx context.Context, msg interface{}) {

									select {

									case c.msgCh <- msg:

									case <-ctx.Done():

									}

								}


								func updateSCVars(vars *synchronizer.SCVariables, update synchronizer.SCVariablesPtr) {

									if update.Rollup != nil {

										vars.Rollup = *update.Rollup

									}

									if update.Auction != nil {

										vars.Auction = *update.Auction

									}

									if update.WDelayer != nil {

										vars.WDelayer = *update.WDelayer

									}

								}


								func (c *Coordinator) syncSCVars(vars synchronizer.SCVariablesPtr) {

									updateSCVars(&c.vars, vars)

								}


								func canForge(auctionConstants *common.AuctionConstants, auctionVars *common.AuctionVariables,

									currentSlot *common.Slot, nextSlot *common.Slot, addr ethCommon.Address, blockNum int64) bool {

									if blockNum < auctionConstants.GenesisBlockNum {

										log.Infow("canForge: requested blockNum is < genesis", "blockNum", blockNum,

											"genesis", auctionConstants.GenesisBlockNum)

										return false

									}

									var slot *common.Slot

									if currentSlot.StartBlock <= blockNum && blockNum <= currentSlot.EndBlock {

										slot = currentSlot

									} else if nextSlot.StartBlock <= blockNum && blockNum <= nextSlot.EndBlock {

										slot = nextSlot

									} else {

										log.Warnw("canForge: requested blockNum is outside current and next slot",

											"blockNum", blockNum, "currentSlot", currentSlot,

											"nextSlot", nextSlot,

										)

										return false

									}

									anyoneForge := false

									if !slot.ForgerCommitment &&

										auctionConstants.RelativeBlock(blockNum) >= int64(auctionVars.SlotDeadline) {

										log.Debugw("canForge: anyone can forge in the current slot (slotDeadline passed)",

											"block", blockNum)

										anyoneForge = true

									}

									if slot.Forger == addr || anyoneForge {

										return true

									}

									log.Debugw("canForge: can't forge", "slot.Forger", slot.Forger)

									return false

								}


								func (c *Coordinator) canForgeAt(blockNum int64) bool {

									return canForge(&c.consts.Auction, &c.vars.Auction,

										&c.stats.Sync.Auction.CurrentSlot, &c.stats.Sync.Auction.NextSlot,

										c.cfg.ForgerAddress, blockNum)

								}


								func (c *Coordinator) canForge() bool {

									blockNum := c.stats.Eth.LastBlock.Num + 1

									return canForge(&c.consts.Auction, &c.vars.Auction,

										&c.stats.Sync.Auction.CurrentSlot, &c.stats.Sync.Auction.NextSlot,

										c.cfg.ForgerAddress, blockNum)

								}


								func (c *Coordinator) syncStats(ctx context.Context, stats *synchronizer.Stats) error {

									nextBlock := c.stats.Eth.LastBlock.Num + 1

									canForge := c.canForgeAt(nextBlock)

									if c.cfg.ScheduleBatchBlocksAheadCheck != 0 && canForge {

										canForge = c.canForgeAt(nextBlock + c.cfg.ScheduleBatchBlocksAheadCheck)

									}

									if c.pipeline == nil {

										relativeBlock := c.consts.Auction.RelativeBlock(nextBlock)

										if canForge && relativeBlock < c.cfg.StartSlotBlocksDelay {

											log.Debugf("Coordinator: delaying pipeline start due to "+

												"relativeBlock (%v) < cfg.StartSlotBlocksDelay (%v)",

												relativeBlock, c.cfg.StartSlotBlocksDelay)

										} else if canForge {

											log.Infow("Coordinator: forging state begin", "block",

												stats.Eth.LastBlock.Num+1, "batch", stats.Sync.LastBatch.BatchNum)

											fromBatch := fromBatch{

												BatchNum:   stats.Sync.LastBatch.BatchNum,

												ForgerAddr: stats.Sync.LastBatch.ForgerAddr,

												StateRoot:  stats.Sync.LastBatch.StateRoot,

											}

											if c.lastNonFailedBatchNum > fromBatch.BatchNum {

												fromBatch.BatchNum = c.lastNonFailedBatchNum

												fromBatch.ForgerAddr = c.cfg.ForgerAddress

												fromBatch.StateRoot = big.NewInt(0)

											}

											// Before starting the pipeline make sure we reset any

											// l2tx from the pool that was forged in a batch that

											// didn't end up being mined.  We are already doing

											// this in handleStopPipeline, but we do it again as a

											// failsafe in case the last synced batchnum is

											// different than in the previous call to l2DB.Reorg,

											// or in case the node was restarted when there was a

											// started batch that included l2txs but was not mined.

											if err := c.l2DB.Reorg(fromBatch.BatchNum); err != nil {

												return tracerr.Wrap(err)

											}

											var err error

											if c.pipeline, err = c.newPipeline(ctx); err != nil {

												return tracerr.Wrap(err)

											}

											c.pipelineFromBatch = fromBatch

											// Start the pipeline

											if err := c.pipeline.Start(fromBatch.BatchNum, stats, &c.vars); err != nil {

												c.pipeline = nil

												return tracerr.Wrap(err)

											}

										}

									} else {

										if !canForge {

											log.Infow("Coordinator: forging state end", "block", stats.Eth.LastBlock.Num+1)

											c.pipeline.Stop(c.ctx)

											c.pipeline = nil

										}

									}

									if c.pipeline == nil {

										if _, err := c.purger.InvalidateMaybe(c.l2DB, c.txSelector.LocalAccountsDB(),

											stats.Sync.LastBlock.Num, int64(stats.Sync.LastBatch.BatchNum)); err != nil {

											return tracerr.Wrap(err)

										}

										if _, err := c.purger.PurgeMaybe(c.l2DB, stats.Sync.LastBlock.Num,

											int64(stats.Sync.LastBatch.BatchNum)); err != nil {

											return tracerr.Wrap(err)

										}

									}

									return nil

								}


								func (c *Coordinator) handleMsgSyncBlock(ctx context.Context, msg *MsgSyncBlock) error {

									c.stats = msg.Stats

									c.syncSCVars(msg.Vars)

									c.txManager.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)

									if c.pipeline != nil {

										c.pipeline.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)

									}

									if !c.stats.Synced() {

										return nil

									}

									return c.syncStats(ctx, &c.stats)

								}


								func (c *Coordinator) handleReorg(ctx context.Context, msg *MsgSyncReorg) error {

									c.stats = msg.Stats

									c.syncSCVars(msg.Vars)

									c.txManager.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)

									if c.pipeline != nil {

										c.pipeline.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)

									}

									if c.stats.Sync.LastBatch.ForgerAddr != c.cfg.ForgerAddress &&

										(c.stats.Sync.LastBatch.StateRoot == nil || c.pipelineFromBatch.StateRoot == nil ||

											c.stats.Sync.LastBatch.StateRoot.Cmp(c.pipelineFromBatch.StateRoot) != 0) {

										// There's been a reorg and the batch state root from which the

										// pipeline was started has changed (probably because it was in

										// a block that was discarded), and it was sent by a different

										// coordinator than us.  That batch may never be in the main

										// chain, so we stop the pipeline  (it will be started again

										// once the node is in sync).

										log.Infow("Coordinator.handleReorg StopPipeline sync.LastBatch.ForgerAddr != cfg.ForgerAddr "+

											"& sync.LastBatch.StateRoot != pipelineFromBatch.StateRoot",

											"sync.LastBatch.StateRoot", c.stats.Sync.LastBatch.StateRoot,

											"pipelineFromBatch.StateRoot", c.pipelineFromBatch.StateRoot)

										c.txManager.DiscardPipeline(ctx, c.pipelineNum)

										if err := c.handleStopPipeline(ctx, "reorg", 0); err != nil {

											return tracerr.Wrap(err)

										}

									}

									return nil

								}


								// handleStopPipeline handles stopping the pipeline.  If failedBatchNum is 0,

								// the next pipeline will start from the last state of the synchronizer,

								// otherwise, it will state from failedBatchNum-1.

								func (c *Coordinator) handleStopPipeline(ctx context.Context, reason string, failedBatchNum common.BatchNum) error {

									batchNum := c.stats.Sync.LastBatch.BatchNum

									if failedBatchNum != 0 {

										batchNum = failedBatchNum - 1

									}

									if c.pipeline != nil {

										c.pipeline.Stop(c.ctx)

										c.pipeline = nil

									}

									if err := c.l2DB.Reorg(batchNum); err != nil {

										return tracerr.Wrap(err)

									}

									c.lastNonFailedBatchNum = batchNum

									return nil

								}


								func (c *Coordinator) handleMsg(ctx context.Context, msg interface{}) error {

									switch msg := msg.(type) {

									case MsgSyncBlock:

										if err := c.handleMsgSyncBlock(ctx, &msg); err != nil {

											return tracerr.Wrap(fmt.Errorf("Coordinator.handleMsgSyncBlock error: %w", err))

										}

									case MsgSyncReorg:

										if err := c.handleReorg(ctx, &msg); err != nil {

											return tracerr.Wrap(fmt.Errorf("Coordinator.handleReorg error: %w", err))

										}

									case MsgStopPipeline:

										log.Infow("Coordinator received MsgStopPipeline", "reason", msg.Reason)

										if err := c.handleStopPipeline(ctx, msg.Reason, msg.FailedBatchNum); err != nil {

											return tracerr.Wrap(fmt.Errorf("Coordinator.handleStopPipeline: %w", err))

										}

									default:

										log.Fatalw("Coordinator Unexpected Coordinator msg of type %T: %+v", msg, msg)

									}

									return nil

								}


								// Start the coordinator

								func (c *Coordinator) Start() {

									if c.started {

										log.Fatal("Coordinator already started")

									}

									c.started = true

									c.wg.Add(1)

									go func() {

										c.txManager.Run(c.ctx)

										c.wg.Done()

									}()


									c.wg.Add(1)

									go func() {

										timer := time.NewTimer(longWaitDuration)

										for {

											select {

											case <-c.ctx.Done():

												log.Info("Coordinator done")

												c.wg.Done()

												return

											case msg := <-c.msgCh:

												if err := c.handleMsg(c.ctx, msg); c.ctx.Err() != nil {

													continue

												} else if err != nil {

													log.Errorw("Coordinator.handleMsg", "err", err)

													if !timer.Stop() {

														<-timer.C

													}

													timer.Reset(c.cfg.SyncRetryInterval)

													continue

												}

											case <-timer.C:

												timer.Reset(longWaitDuration)

												if !c.stats.Synced() {

													continue

												}

												if err := c.syncStats(c.ctx, &c.stats); c.ctx.Err() != nil {

													continue

												} else if err != nil {

													log.Errorw("Coordinator.syncStats", "err", err)

													if !timer.Stop() {

														<-timer.C

													}

													timer.Reset(c.cfg.SyncRetryInterval)

													continue

												}

											}

										}

									}()


									c.wg.Add(1)

									go func() {

										for {

											select {

											case <-c.ctx.Done():

												log.Info("Coordinator L2DB.PurgeByExternalDelete loop done")

												c.wg.Done()

												return

											case <-time.After(c.cfg.PurgeByExtDelInterval):

												c.mutexL2DBUpdateDelete.Lock()

												if err := c.l2DB.PurgeByExternalDelete(); err != nil {

													log.Errorw("L2DB.PurgeByExternalDelete", "err", err)

												}

												c.mutexL2DBUpdateDelete.Unlock()

											}

										}

									}()

								}


								const stopCtxTimeout = 200 * time.Millisecond


								// Stop the coordinator

								func (c *Coordinator) Stop() {

									if !c.started {

										log.Fatal("Coordinator already stopped")

									}

									c.started = false

									log.Infow("Stopping Coordinator...")

									c.cancel()

									c.wg.Wait()

									if c.pipeline != nil {

										ctx, cancel := context.WithTimeout(context.Background(), stopCtxTimeout)

										defer cancel()

										c.pipeline.Stop(ctx)

										c.pipeline = nil

									}

								}