package coordinator

import (
	"context"
	"fmt"
	"math/big"
	"os"
	"sync"
	"time"

	ethCommon "github.com/ethereum/go-ethereum/common"
	"github.com/hermeznetwork/hermez-node/batchbuilder"
	"github.com/hermeznetwork/hermez-node/common"
	"github.com/hermeznetwork/hermez-node/config"
	"github.com/hermeznetwork/hermez-node/db/historydb"
	"github.com/hermeznetwork/hermez-node/db/l2db"
	"github.com/hermeznetwork/hermez-node/eth"
	"github.com/hermeznetwork/hermez-node/log"
	"github.com/hermeznetwork/hermez-node/prover"
	"github.com/hermeznetwork/hermez-node/synchronizer"
	"github.com/hermeznetwork/hermez-node/txprocessor"
	"github.com/hermeznetwork/hermez-node/txselector"
	"github.com/hermeznetwork/tracerr"
)

var (
	errLastL1BatchNotSynced  = fmt.Errorf("last L1Batch not synced yet")
	errForgeNoTxsBeforeDelay = fmt.Errorf("no txs to forge and we haven't reached the forge no txs delay")
	errForgeBeforeDelay      = fmt.Errorf("we haven't reached the forge delay")
)

const (
	queueLen         = 16
	longWaitDuration = 999 * time.Hour
	zeroDuration     = 0 * time.Second
)

// Config contains the Coordinator configuration
type Config struct {
	// ForgerAddress is the address under which this coordinator is forging
	ForgerAddress ethCommon.Address
	// ConfirmBlocks is the number of confirmation blocks to wait for sent
	// ethereum transactions before forgetting about them
	ConfirmBlocks int64
	// L1BatchTimeoutPerc is the portion of the range before the L1Batch
	// timeout that will trigger a schedule to forge an L1Batch
	L1BatchTimeoutPerc float64
	// StartSlotBlocksDelay is the number of blocks of delay to wait before
	// starting the pipeline when we reach a slot in which we can forge.
	StartSlotBlocksDelay int64
	// ScheduleBatchBlocksAheadCheck is the number of blocks ahead in which
	// the forger address is checked to be allowed to forge (apart from
	// checking the next block), used to decide when to stop scheduling new
	// batches (by stopping the pipeline).
	// For example, if we are at block 10 and ScheduleBatchBlocksAheadCheck
	// is 5, even though at block 11 we canForge, the pipeline will be
	// stopped if we can't forge at block 15.
	// This value should be the expected number of blocks it takes between
	// scheduling a batch and having it mined.
	ScheduleBatchBlocksAheadCheck int64
	// SendBatchBlocksMarginCheck is the number of margin blocks ahead in
	// which the coordinator is also checked to be allowed to forge, apart
	// from the next block; used to decide when to stop sending batches to
	// the smart contract.
	// For example, if we are at block 10 and SendBatchBlocksMarginCheck is
	// 5, even though at block 11 we canForge, the batch will be discarded
	// if we can't forge at block 15.
	// This value should be the expected number of blocks it takes between
	// sending a batch and having it mined.
	SendBatchBlocksMarginCheck int64
	// EthClientAttempts is the number of attempts to do an eth client RPC
	// call before giving up
	EthClientAttempts int
	// ForgeRetryInterval is the waiting interval between calls to forge a
	// batch after an error
	ForgeRetryInterval time.Duration
	// ForgeDelay is the delay after which a batch is forged if the slot is
	// already committed. If set to 0s, the coordinator will continuously
	// forge at the maximum rate.
	ForgeDelay time.Duration
	// ForgeNoTxsDelay is the delay after which a batch is forged even if
	// there are no txs to forge if the slot is already committed. If set
	// to 0s, the coordinator will continuously forge even if the batches
	// are empty.
	ForgeNoTxsDelay time.Duration
	// SyncRetryInterval is the waiting interval between calls to the main
	// handler of a synced block after an error
	SyncRetryInterval time.Duration
	// PurgeByExtDelInterval is the waiting interval between calls
	// to the PurgeByExternalDelete function of the l2db which deletes
	// pending txs externally marked by the column `external_delete`
	PurgeByExtDelInterval time.Duration
	// EthClientAttemptsDelay is the delay between attempts to do an eth
	// client RPC call
	EthClientAttemptsDelay time.Duration
	// EthTxResendTimeout is the timeout after which a non-mined ethereum
	// transaction will be resent (reusing the nonce) with a newly
	// calculated gas price
	EthTxResendTimeout time.Duration
	// EthNoReuseNonce disables reusing nonces of pending transactions for
	// new replacement transactions
	EthNoReuseNonce bool
	// MaxGasPrice is the maximum gas price allowed for ethereum
	// transactions
	MaxGasPrice *big.Int
	// GasPriceIncPerc is the percentage increase of gas price set in an
	// ethereum transaction over the gas price suggested by the ethereum
	// node
	GasPriceIncPerc int64
	// TxManagerCheckInterval is the waiting interval between receipt
	// checks of ethereum transactions in the TxManager
	TxManagerCheckInterval time.Duration
	// DebugBatchPath if set, specifies the path where batchInfo is stored
	// in JSON in every step/update of the pipeline
	DebugBatchPath string
	Purger         PurgerCfg
	// VerifierIdx is the index of the verifier contract registered in the
	// smart contract
	VerifierIdx uint8
	// ForgeBatchGasCost contains the cost of each action in the
	// ForgeBatch transaction.
	ForgeBatchGasCost config.ForgeBatchGasCost

	TxProcessorConfig txprocessor.Config
}
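// newExampleConfigSketch is an illustrative sketch, not part of the original
// code: it builds a Config with only the timing-related knobs documented
// above set, using placeholder values (assumptions, not recommendations).
// With ForgeDelay > 0 the coordinator forges at most one batch per ForgeDelay,
// and with ForgeNoTxsDelay > 0 it holds off that long before forging an empty
// batch.
func newExampleConfigSketch(forger ethCommon.Address) Config {
	return Config{
		ForgerAddress:                 forger,
		ConfirmBlocks:                 10,               // forget sent eth txs after 10 confirmations
		L1BatchTimeoutPerc:            0.6,              // schedule an L1Batch at 60% of the timeout range
		ScheduleBatchBlocksAheadCheck: 3,                // stop scheduling if we can't forge 3 blocks ahead
		SendBatchBlocksMarginCheck:    1,                // discard batches if we can't forge 1 block ahead
		EthClientAttempts:             5,                // give up an RPC call after 5 attempts
		ForgeRetryInterval:            10 * time.Second, // retry a failed forge after 10s
		ForgeDelay:                    10 * time.Second, // forge at most one batch every 10s
		ForgeNoTxsDelay:               5 * time.Minute,  // wait up to 5m before forging an empty batch
		SyncRetryInterval:             time.Second,
	}
}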
func (c *Config) debugBatchStore(batchInfo *BatchInfo) {
	if c.DebugBatchPath != "" {
		if err := batchInfo.DebugStore(c.DebugBatchPath); err != nil {
			log.Warnw("Error storing debug BatchInfo",
				"path", c.DebugBatchPath, "err", err)
		}
	}
}

type fromBatch struct {
	BatchNum   common.BatchNum
	ForgerAddr ethCommon.Address
	StateRoot  *big.Int
}

// Coordinator implements the Coordinator type
type Coordinator struct {
	// State
	pipelineNum       int       // Pipeline sequential number. The first pipeline is 1
	pipelineFromBatch fromBatch // batch from which we started the pipeline
	provers           []prover.Client
	consts            synchronizer.SCConsts
	vars              synchronizer.SCVariables
	stats             synchronizer.Stats
	started           bool

	cfg Config

	historyDB    *historydb.HistoryDB
	l2DB         *l2db.L2DB
	txSelector   *txselector.TxSelector
	batchBuilder *batchbuilder.BatchBuilder

	msgCh  chan interface{}
	ctx    context.Context
	wg     sync.WaitGroup
	cancel context.CancelFunc

	// mutexL2DBUpdateDelete protects updates to the L2DB so that
	// these two processes always happen exclusively:
	// - Pipeline taking pending txs, running them through the TxProcessor
	//   and marking selected txs as forging
	// - Coordinator deleting pending txs that have been marked with
	//   `external_delete`.
	// Without this mutex, the coordinator could delete a pending tx that
	// has just been selected by the TxProcessor in the pipeline.
	mutexL2DBUpdateDelete sync.Mutex
	pipeline              *Pipeline
	lastNonFailedBatchNum common.BatchNum

	purger    *Purger
	txManager *TxManager
}

// NewCoordinator creates a new Coordinator
func NewCoordinator(cfg Config,
	historyDB *historydb.HistoryDB,
	l2DB *l2db.L2DB,
	txSelector *txselector.TxSelector,
	batchBuilder *batchbuilder.BatchBuilder,
	serverProofs []prover.Client,
	ethClient eth.ClientInterface,
	scConsts *common.SCConsts,
	initSCVars *common.SCVariables,
) (*Coordinator, error) {
	// nolint reason: hardcoded `1.0`, by design the percentage can't be over 100%
	if cfg.L1BatchTimeoutPerc >= 1.0 { //nolint:gomnd
		return nil, tracerr.Wrap(fmt.Errorf("invalid value for Config.L1BatchTimeoutPerc (%v >= 1.0)",
			cfg.L1BatchTimeoutPerc))
	}
	if cfg.EthClientAttempts < 1 {
		return nil, tracerr.Wrap(fmt.Errorf("invalid value for Config.EthClientAttempts (%v < 1)",
			cfg.EthClientAttempts))
	}

	if cfg.DebugBatchPath != "" {
		if err := os.MkdirAll(cfg.DebugBatchPath, 0744); err != nil {
			return nil, tracerr.Wrap(err)
		}
	}

	purger := Purger{
		cfg:                 cfg.Purger,
		lastPurgeBlock:      0,
		lastPurgeBatch:      0,
		lastInvalidateBlock: 0,
		lastInvalidateBatch: 0,
	}

	ctx, cancel := context.WithCancel(context.Background())
	c := Coordinator{
		pipelineNum: 0,
		pipelineFromBatch: fromBatch{
			BatchNum:   0,
			ForgerAddr: ethCommon.Address{},
			StateRoot:  big.NewInt(0),
		},
		provers:      serverProofs,
		consts:       *scConsts,
		vars:         *initSCVars,
		cfg:          cfg,
		historyDB:    historyDB,
		l2DB:         l2DB,
		txSelector:   txSelector,
		batchBuilder: batchBuilder,
		purger:       &purger,
		msgCh:        make(chan interface{}),
		ctx:          ctx,
		// wg
		cancel: cancel,
	}
	ctxTimeout, ctxTimeoutCancel := context.WithTimeout(ctx, 1*time.Second)
	defer ctxTimeoutCancel()
	txManager, err := NewTxManager(ctxTimeout, &cfg, ethClient, l2DB, &c,
		scConsts, initSCVars)
	if err != nil {
		return nil, tracerr.Wrap(err)
	}
	c.txManager = txManager
	// Set Eth LastBlockNum to -1 in stats so that stats.Synced() is
	// guaranteed to return false before it's updated with real stats
	c.stats.Eth.LastBlock.Num = -1
	return &c, nil
}

// TxSelector returns the inner TxSelector
func (c *Coordinator) TxSelector() *txselector.TxSelector {
	return c.txSelector
}

// BatchBuilder returns the inner BatchBuilder
func (c *Coordinator) BatchBuilder() *batchbuilder.BatchBuilder {
	return c.batchBuilder
}

func (c *Coordinator) newPipeline(ctx context.Context) (*Pipeline, error) {
	c.pipelineNum++
	return NewPipeline(ctx, c.cfg, c.pipelineNum, c.historyDB, c.l2DB, c.txSelector,
		c.batchBuilder, &c.mutexL2DBUpdateDelete, c.purger, c, c.txManager,
		c.provers, &c.consts)
}

// MsgSyncBlock indicates an update to the Synchronizer stats
type MsgSyncBlock struct {
	Stats   synchronizer.Stats
	Batches []common.BatchData
	// Vars contains each smart contract's variables if they were updated,
	// or nil if they haven't changed.
	Vars synchronizer.SCVariablesPtr
}

// MsgSyncReorg indicates a reorg
type MsgSyncReorg struct {
	Stats synchronizer.Stats
	Vars  synchronizer.SCVariablesPtr
}

// MsgStopPipeline indicates a signal to reset the pipeline
type MsgStopPipeline struct {
	Reason string
	// FailedBatchNum indicates the first batchNum that failed in the
	// pipeline. If FailedBatchNum is 0, it should be ignored.
	FailedBatchNum common.BatchNum
}

// SendMsg is a thread-safe method to pass a message to the Coordinator
func (c *Coordinator) SendMsg(ctx context.Context, msg interface{}) {
	select {
	case c.msgCh <- msg:
	case <-ctx.Done():
	}
}
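// notifySyncedBlockSketch is a hypothetical illustration, not part of the
// original code, of how the synchronizer side is expected to feed the
// coordinator: after every synced block it sends a MsgSyncBlock with the
// updated Stats and any changed SCVariables, on a rollback it would send a
// MsgSyncReorg, and a component that detects a failed batch would send a
// MsgStopPipeline instead.
func notifySyncedBlockSketch(ctx context.Context, c *Coordinator,
	stats synchronizer.Stats, vars synchronizer.SCVariablesPtr,
	batches []common.BatchData) {
	// SendMsg blocks until the coordinator receives the message or ctx is
	// cancelled.
	c.SendMsg(ctx, MsgSyncBlock{
		Stats:   stats,
		Batches: batches,
		Vars:    vars,
	})
}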
func updateSCVars(vars *synchronizer.SCVariables, update synchronizer.SCVariablesPtr) {
	if update.Rollup != nil {
		vars.Rollup = *update.Rollup
	}
	if update.Auction != nil {
		vars.Auction = *update.Auction
	}
	if update.WDelayer != nil {
		vars.WDelayer = *update.WDelayer
	}
}

func (c *Coordinator) syncSCVars(vars synchronizer.SCVariablesPtr) {
	updateSCVars(&c.vars, vars)
}

func canForge(auctionConstants *common.AuctionConstants, auctionVars *common.AuctionVariables,
	currentSlot *common.Slot, nextSlot *common.Slot, addr ethCommon.Address, blockNum int64) bool {
	if blockNum < auctionConstants.GenesisBlockNum {
		log.Infow("canForge: requested blockNum is < genesis", "blockNum", blockNum,
			"genesis", auctionConstants.GenesisBlockNum)
		return false
	}
	var slot *common.Slot
	if currentSlot.StartBlock <= blockNum && blockNum <= currentSlot.EndBlock {
		slot = currentSlot
	} else if nextSlot.StartBlock <= blockNum && blockNum <= nextSlot.EndBlock {
		slot = nextSlot
	} else {
		log.Warnw("canForge: requested blockNum is outside current and next slot",
			"blockNum", blockNum, "currentSlot", currentSlot, "nextSlot", nextSlot,
		)
		return false
	}
	anyoneForge := false
	if !slot.ForgerCommitment &&
		auctionConstants.RelativeBlock(blockNum) >= int64(auctionVars.SlotDeadline) {
		log.Debugw("canForge: anyone can forge in the current slot (slotDeadline passed)",
			"block", blockNum)
		anyoneForge = true
	}
	if slot.Forger == addr || anyoneForge {
		return true
	}
	log.Debugw("canForge: can't forge", "slot.Forger", slot.Forger)
	return false
}

func (c *Coordinator) canForgeAt(blockNum int64) bool {
	return canForge(&c.consts.Auction, &c.vars.Auction,
		&c.stats.Sync.Auction.CurrentSlot, &c.stats.Sync.Auction.NextSlot,
		c.cfg.ForgerAddress, blockNum)
}

func (c *Coordinator) canForge() bool {
	blockNum := c.stats.Eth.LastBlock.Num + 1
	return canForge(&c.consts.Auction, &c.vars.Auction,
		&c.stats.Sync.Auction.CurrentSlot, &c.stats.Sync.Auction.NextSlot,
		c.cfg.ForgerAddress, blockNum)
}
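// anyoneCanForgeAtSketch is a hypothetical helper, not part of the original
// code, that isolates the "anyone can forge" rule used by canForge: if no
// batch has been forged in the slot yet (no ForgerCommitment), then once the
// block offset inside the slot reaches the auction SlotDeadline, any
// coordinator may forge, not only the winner of the slot's bid.
func anyoneCanForgeAtSketch(auctionConstants *common.AuctionConstants,
	auctionVars *common.AuctionVariables, slot *common.Slot, blockNum int64) bool {
	return !slot.ForgerCommitment &&
		auctionConstants.RelativeBlock(blockNum) >= int64(auctionVars.SlotDeadline)
}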
func (c *Coordinator) syncStats(ctx context.Context, stats *synchronizer.Stats) error {
	nextBlock := c.stats.Eth.LastBlock.Num + 1
	canForge := c.canForgeAt(nextBlock)
	if c.cfg.ScheduleBatchBlocksAheadCheck != 0 && canForge {
		canForge = c.canForgeAt(nextBlock + c.cfg.ScheduleBatchBlocksAheadCheck)
	}
	if c.pipeline == nil {
		relativeBlock := c.consts.Auction.RelativeBlock(nextBlock)
		if canForge && relativeBlock < c.cfg.StartSlotBlocksDelay {
			log.Debugf("Coordinator: delaying pipeline start due to "+
				"relativeBlock (%v) < cfg.StartSlotBlocksDelay (%v)",
				relativeBlock, c.cfg.StartSlotBlocksDelay)
		} else if canForge {
			log.Infow("Coordinator: forging state begin",
				"block", stats.Eth.LastBlock.Num+1,
				"batch", stats.Sync.LastBatch.BatchNum)
			fromBatch := fromBatch{
				BatchNum:   stats.Sync.LastBatch.BatchNum,
				ForgerAddr: stats.Sync.LastBatch.ForgerAddr,
				StateRoot:  stats.Sync.LastBatch.StateRoot,
			}
			if c.lastNonFailedBatchNum > fromBatch.BatchNum {
				fromBatch.BatchNum = c.lastNonFailedBatchNum
				fromBatch.ForgerAddr = c.cfg.ForgerAddress
				fromBatch.StateRoot = big.NewInt(0)
			}
			// Before starting the pipeline make sure we reset any
			// l2tx from the pool that was forged in a batch that
			// didn't end up being mined. We are already doing
			// this in handleStopPipeline, but we do it again as a
			// failsafe in case the last synced batchnum is
			// different than in the previous call to l2DB.Reorg,
			// or in case the node was restarted when there was a
			// started batch that included l2txs but was not mined.
			if err := c.l2DB.Reorg(fromBatch.BatchNum); err != nil {
				return tracerr.Wrap(err)
			}
			var err error
			if c.pipeline, err = c.newPipeline(ctx); err != nil {
				return tracerr.Wrap(err)
			}
			c.pipelineFromBatch = fromBatch
			// Start the pipeline
			if err := c.pipeline.Start(fromBatch.BatchNum, stats, &c.vars); err != nil {
				c.pipeline = nil
				return tracerr.Wrap(err)
			}
		}
	} else {
		if !canForge {
			log.Infow("Coordinator: forging state end",
				"block", stats.Eth.LastBlock.Num+1)
			c.pipeline.Stop(c.ctx)
			c.pipeline = nil
		}
	}
	if c.pipeline == nil {
		if _, err := c.purger.InvalidateMaybe(c.l2DB, c.txSelector.LocalAccountsDB(),
			stats.Sync.LastBlock.Num, int64(stats.Sync.LastBatch.BatchNum)); err != nil {
			return tracerr.Wrap(err)
		}
		if _, err := c.purger.PurgeMaybe(c.l2DB, stats.Sync.LastBlock.Num,
			int64(stats.Sync.LastBatch.BatchNum)); err != nil {
			return tracerr.Wrap(err)
		}
	}
	return nil
}

func (c *Coordinator) handleMsgSyncBlock(ctx context.Context, msg *MsgSyncBlock) error {
	c.stats = msg.Stats
	c.syncSCVars(msg.Vars)
	c.txManager.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)
	if c.pipeline != nil {
		c.pipeline.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)
	}
	if !c.stats.Synced() {
		return nil
	}
	return c.syncStats(ctx, &c.stats)
}

func (c *Coordinator) handleReorg(ctx context.Context, msg *MsgSyncReorg) error {
	c.stats = msg.Stats
	c.syncSCVars(msg.Vars)
	c.txManager.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)
	if c.pipeline != nil {
		c.pipeline.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)
	}
	if c.stats.Sync.LastBatch.ForgerAddr != c.cfg.ForgerAddress &&
		(c.stats.Sync.LastBatch.StateRoot == nil || c.pipelineFromBatch.StateRoot == nil ||
			c.stats.Sync.LastBatch.StateRoot.Cmp(c.pipelineFromBatch.StateRoot) != 0) {
		// There's been a reorg and the batch state root from which the
		// pipeline was started has changed (probably because it was in
		// a block that was discarded), and it was sent by a different
		// coordinator than us. That batch may never be in the main
		// chain, so we stop the pipeline (it will be started again
		// once the node is in sync).
		log.Infow("Coordinator.handleReorg StopPipeline sync.LastBatch.ForgerAddr != cfg.ForgerAddr "+
			"& sync.LastBatch.StateRoot != pipelineFromBatch.StateRoot",
			"sync.LastBatch.StateRoot", c.stats.Sync.LastBatch.StateRoot,
			"pipelineFromBatch.StateRoot", c.pipelineFromBatch.StateRoot)
		c.txManager.DiscardPipeline(ctx, c.pipelineNum)
		if err := c.handleStopPipeline(ctx, "reorg", 0); err != nil {
			return tracerr.Wrap(err)
		}
	}
	return nil
}
// handleStopPipeline handles stopping the pipeline. If failedBatchNum is 0,
// the next pipeline will start from the last state of the synchronizer,
// otherwise it will start from failedBatchNum-1.
func (c *Coordinator) handleStopPipeline(ctx context.Context, reason string,
	failedBatchNum common.BatchNum) error {
	batchNum := c.stats.Sync.LastBatch.BatchNum
	if failedBatchNum != 0 {
		batchNum = failedBatchNum - 1
	}
	if c.pipeline != nil {
		c.pipeline.Stop(c.ctx)
		c.pipeline = nil
	}
	if err := c.l2DB.Reorg(batchNum); err != nil {
		return tracerr.Wrap(err)
	}
	c.lastNonFailedBatchNum = batchNum
	return nil
}

func (c *Coordinator) handleMsg(ctx context.Context, msg interface{}) error {
	switch msg := msg.(type) {
	case MsgSyncBlock:
		if err := c.handleMsgSyncBlock(ctx, &msg); err != nil {
			return tracerr.Wrap(fmt.Errorf("Coordinator.handleMsgSyncBlock error: %w", err))
		}
	case MsgSyncReorg:
		if err := c.handleReorg(ctx, &msg); err != nil {
			return tracerr.Wrap(fmt.Errorf("Coordinator.handleReorg error: %w", err))
		}
	case MsgStopPipeline:
		log.Infow("Coordinator received MsgStopPipeline", "reason", msg.Reason)
		if err := c.handleStopPipeline(ctx, msg.Reason, msg.FailedBatchNum); err != nil {
			return tracerr.Wrap(fmt.Errorf("Coordinator.handleStopPipeline: %w", err))
		}
	default:
		log.Fatalw("Coordinator: unexpected Coordinator msg",
			"type", fmt.Sprintf("%T", msg), "msg", msg)
	}
	return nil
}

// Start the coordinator
func (c *Coordinator) Start() {
	if c.started {
		log.Fatal("Coordinator already started")
	}
	c.started = true

	c.wg.Add(1)
	go func() {
		c.txManager.Run(c.ctx)
		c.wg.Done()
	}()

	c.wg.Add(1)
	go func() {
		timer := time.NewTimer(longWaitDuration)
		for {
			select {
			case <-c.ctx.Done():
				log.Info("Coordinator done")
				c.wg.Done()
				return
			case msg := <-c.msgCh:
				if err := c.handleMsg(c.ctx, msg); c.ctx.Err() != nil {
					continue
				} else if err != nil {
					log.Errorw("Coordinator.handleMsg", "err", err)
					if !timer.Stop() {
						<-timer.C
					}
					timer.Reset(c.cfg.SyncRetryInterval)
					continue
				}
			case <-timer.C:
				timer.Reset(longWaitDuration)
				if !c.stats.Synced() {
					continue
				}
				if err := c.syncStats(c.ctx, &c.stats); c.ctx.Err() != nil {
					continue
				} else if err != nil {
					log.Errorw("Coordinator.syncStats", "err", err)
					if !timer.Stop() {
						<-timer.C
					}
					timer.Reset(c.cfg.SyncRetryInterval)
					continue
				}
			}
		}
	}()

	c.wg.Add(1)
	go func() {
		for {
			select {
			case <-c.ctx.Done():
				log.Info("Coordinator L2DB.PurgeByExternalDelete loop done")
				c.wg.Done()
				return
			case <-time.After(c.cfg.PurgeByExtDelInterval):
				c.mutexL2DBUpdateDelete.Lock()
				if err := c.l2DB.PurgeByExternalDelete(); err != nil {
					log.Errorw("L2DB.PurgeByExternalDelete", "err", err)
				}
				c.mutexL2DBUpdateDelete.Unlock()
			}
		}
	}()
}

const stopCtxTimeout = 200 * time.Millisecond

// Stop the coordinator
func (c *Coordinator) Stop() {
	if !c.started {
		log.Fatal("Coordinator already stopped")
	}
	c.started = false
	log.Infow("Stopping Coordinator...")
	c.cancel()
	c.wg.Wait()
	if c.pipeline != nil {
		ctx, cancel := context.WithTimeout(context.Background(), stopCtxTimeout)
		defer cancel()
		c.pipeline.Stop(ctx)
		c.pipeline = nil
	}
}
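// runCoordinatorSketch is a usage sketch, not part of the original code: it
// assumes the node has already constructed all the dependencies (DBs, tx
// selector, batch builder, provers, eth client and the smart-contract
// constants/variables) and only illustrates the expected lifecycle:
// NewCoordinator, Start, a stream of SendMsg calls driven by the
// synchronizer, and Stop on shutdown. The syncedBlocks channel is assumed to
// be fed by the synchronizer loop.
func runCoordinatorSketch(ctx context.Context, cfg Config,
	historyDB *historydb.HistoryDB, l2DB *l2db.L2DB,
	txSelector *txselector.TxSelector, batchBuilder *batchbuilder.BatchBuilder,
	serverProofs []prover.Client, ethClient eth.ClientInterface,
	scConsts *common.SCConsts, initSCVars *common.SCVariables,
	syncedBlocks <-chan MsgSyncBlock) error {
	coord, err := NewCoordinator(cfg, historyDB, l2DB, txSelector, batchBuilder,
		serverProofs, ethClient, scConsts, initSCVars)
	if err != nil {
		return tracerr.Wrap(err)
	}
	coord.Start()
	defer coord.Stop()

	// In the node, the synchronizer calls SendMsg after each synced block;
	// here the updates are assumed to arrive on a channel.
	for {
		select {
		case <-ctx.Done():
			return nil
		case msg := <-syncedBlocks:
			coord.SendMsg(ctx, msg)
		}
	}
}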