arnaucube
/
hermez-node
mirror of https://github.com/arnaucube/hermez-node.git

package coordinator

import (
	"context"
	"fmt"
	"math/big"
	"os"
	"sync"
	"time"

	ethCommon "github.com/ethereum/go-ethereum/common"
	"github.com/hermeznetwork/hermez-node/batchbuilder"
	"github.com/hermeznetwork/hermez-node/common"
	"github.com/hermeznetwork/hermez-node/config"
	"github.com/hermeznetwork/hermez-node/db/historydb"
	"github.com/hermeznetwork/hermez-node/db/l2db"
	"github.com/hermeznetwork/hermez-node/eth"
	"github.com/hermeznetwork/hermez-node/log"
	"github.com/hermeznetwork/hermez-node/prover"
	"github.com/hermeznetwork/hermez-node/synchronizer"
	"github.com/hermeznetwork/hermez-node/txprocessor"
	"github.com/hermeznetwork/hermez-node/txselector"
	"github.com/hermeznetwork/tracerr"
)

var (
	errLastL1BatchNotSynced  = fmt.Errorf("last L1Batch not synced yet")
	errForgeNoTxsBeforeDelay = fmt.Errorf("no txs to forge and we haven't reached the forge no txs delay")
	errForgeBeforeDelay      = fmt.Errorf("we haven't reached the forge delay")
)

const (
	queueLen         = 16
	longWaitDuration = 999 * time.Hour
	zeroDuration     = 0 * time.Second
)

// Config contains the Coordinator configuration
type Config struct {
	// ForgerAddress is the address under which this coordinator is forging
	ForgerAddress ethCommon.Address
	// ConfirmBlocks is the number of confirmation blocks to wait for sent
	// ethereum transactions before forgetting about them
	ConfirmBlocks int64
	// L1BatchTimeoutPerc is the portion of the range before the L1Batch
	// timeout that will trigger a schedule to forge an L1Batch
	L1BatchTimeoutPerc float64
	// StartSlotBlocksDelay is the number of blocks of delay to wait before
	// starting the pipeline when we reach a slot in which we can forge.
	StartSlotBlocksDelay int64
	// ScheduleBatchBlocksAheadCheck is the number of blocks ahead in which
	// the forger address is checked to be allowed to forge (apart from
	// checking the next block), used to decide when to stop scheduling new
	// batches (by stopping the pipeline).
	// For example, if we are at block 10 and ScheduleBatchBlocksAheadCheck
	// is 5, eventhough at block 11 we canForge, the pipeline will be
	// stopped if we can't forge at block 15.
	// This value should be the expected number of blocks it takes between
	// scheduling a batch and having it mined.
	ScheduleBatchBlocksAheadCheck int64
	// SendBatchBlocksMarginCheck is the number of margin blocks ahead in
	// which the coordinator is also checked to be allowed to forge, apart
	// from the next block; used to decide when to stop sending batches to
	// the smart contract.
	// For example, if we are at block 10 and SendBatchBlocksMarginCheck is
	// 5, eventhough at block 11 we canForge, the batch will be discarded
	// if we can't forge at block 15.
	// This value should be the expected number of blocks it takes between
	// sending a batch and having it mined.
	SendBatchBlocksMarginCheck int64
	// EthClientAttempts is the number of attempts to do an eth client RPC
	// call before giving up
	EthClientAttempts int
	// ForgeRetryInterval is the waiting interval between calls forge a
	// batch after an error
	ForgeRetryInterval time.Duration
	// ForgeDelay is the delay after which a batch is forged if the slot is
	// already committed.  If set to 0s, the coordinator will continuously
	// forge at the maximum rate.
	ForgeDelay time.Duration
	// ForgeNoTxsDelay is the delay after which a batch is forged even if
	// there are no txs to forge if the slot is already committed.  If set
	// to 0s, the coordinator will continuously forge even if the batches
	// are empty.
	ForgeNoTxsDelay time.Duration
	// SyncRetryInterval is the waiting interval between calls to the main
	// handler of a synced block after an error
	SyncRetryInterval time.Duration
	// PurgeByExtDelInterval is the waiting interval between calls
	// to the PurgeByExternalDelete function of the l2db which deletes
	// pending txs externally marked by the column `external_delete`
	PurgeByExtDelInterval time.Duration
	// EthClientAttemptsDelay is delay between attempts do do an eth client
	// RPC call
	EthClientAttemptsDelay time.Duration
	// EthTxResendTimeout is the timeout after which a non-mined ethereum
	// transaction will be resent (reusing the nonce) with a newly
	// calculated gas price
	EthTxResendTimeout time.Duration
	// EthNoReuseNonce disables reusing nonces of pending transactions for
	// new replacement transactions
	EthNoReuseNonce bool
	// MaxGasPrice is the maximum gas price allowed for ethereum
	// transactions
	MaxGasPrice *big.Int
	// GasPriceIncPerc is the percentage increase of gas price set in an
	// ethereum transaction from the suggested gas price by the ehtereum
	// node
	GasPriceIncPerc int64
	// TxManagerCheckInterval is the waiting interval between receipt
	// checks of ethereum transactions in the TxManager
	TxManagerCheckInterval time.Duration
	// DebugBatchPath if set, specifies the path where batchInfo is stored
	// in JSON in every step/update of the pipeline
	DebugBatchPath string
	Purger         PurgerCfg
	// VerifierIdx is the index of the verifier contract registered in the
	// smart contract
	VerifierIdx uint8
	// ForgeBatchGasCost contains the cost of each action in the
	// ForgeBatch transaction.
	ForgeBatchGasCost config.ForgeBatchGasCost
	TxProcessorConfig txprocessor.Config
}

func (c *Config) debugBatchStore(batchInfo *BatchInfo) {
	if c.DebugBatchPath != "" {
		if err := batchInfo.DebugStore(c.DebugBatchPath); err != nil {
			log.Warnw("Error storing debug BatchInfo",
				"path", c.DebugBatchPath, "err", err)
		}
	}
}

type fromBatch struct {
	BatchNum   common.BatchNum
	ForgerAddr ethCommon.Address
	StateRoot  *big.Int
}

// Coordinator implements the Coordinator type
type Coordinator struct {
	// State
	pipelineNum       int       // Pipeline sequential number.  The first pipeline is 1
	pipelineFromBatch fromBatch // batch from which we started the pipeline
	provers           []prover.Client
	consts            common.SCConsts
	vars              common.SCVariables
	stats             synchronizer.Stats
	started           bool

	cfg Config

	historyDB    *historydb.HistoryDB
	l2DB         *l2db.L2DB
	txSelector   *txselector.TxSelector
	batchBuilder *batchbuilder.BatchBuilder

	msgCh  chan interface{}
	ctx    context.Context
	wg     sync.WaitGroup
	cancel context.CancelFunc

	// mutexL2DBUpdateDelete protects updates to the L2DB so that
	// these two processes always happen exclusively:
	// - Pipeline taking pending txs, running through the TxProcessor and
	//   marking selected txs as forging
	// - Coordinator deleting pending txs that have been marked with
	//   `external_delete`.
	// Without this mutex, the coordinator could delete a pending txs that
	// has just been selected by the TxProcessor in the pipeline.
	mutexL2DBUpdateDelete sync.Mutex
	pipeline              *Pipeline
	lastNonFailedBatchNum common.BatchNum

	purger    *Purger
	txManager *TxManager
}

// NewCoordinator creates a new Coordinator
func NewCoordinator(cfg Config,
	historyDB *historydb.HistoryDB,
	l2DB *l2db.L2DB,
	txSelector *txselector.TxSelector,
	batchBuilder *batchbuilder.BatchBuilder,
	serverProofs []prover.Client,
	ethClient eth.ClientInterface,
	scConsts *common.SCConsts,
	initSCVars *common.SCVariables,
) (*Coordinator, error) {
	// nolint reason: hardcoded `1.0`, by design the percentage can't be over 100%
	if cfg.L1BatchTimeoutPerc >= 1.0 { //nolint:gomnd
		return nil, tracerr.Wrap(fmt.Errorf("invalid value for Config.L1BatchTimeoutPerc (%v >= 1.0)",
			cfg.L1BatchTimeoutPerc))
	}
	if cfg.EthClientAttempts < 1 {
		return nil, tracerr.Wrap(fmt.Errorf("invalid value for Config.EthClientAttempts (%v < 1)",
			cfg.EthClientAttempts))
	}

	if cfg.DebugBatchPath != "" {
		if err := os.MkdirAll(cfg.DebugBatchPath, 0744); err != nil {
			return nil, tracerr.Wrap(err)
		}
	}

	purger := Purger{
		cfg:                 cfg.Purger,
		lastPurgeBlock:      0,
		lastPurgeBatch:      0,
		lastInvalidateBlock: 0,
		lastInvalidateBatch: 0,
	}

	ctx, cancel := context.WithCancel(context.Background())
	c := Coordinator{
		pipelineNum: 0,
		pipelineFromBatch: fromBatch{
			BatchNum:   0,
			ForgerAddr: ethCommon.Address{},
			StateRoot:  big.NewInt(0),
		},
		provers: serverProofs,
		consts:  *scConsts,
		vars:    *initSCVars,

		cfg: cfg,

		historyDB:    historyDB,
		l2DB:         l2DB,
		txSelector:   txSelector,
		batchBuilder: batchBuilder,

		purger: &purger,

		msgCh: make(chan interface{}),
		ctx:   ctx,
		// wg
		cancel: cancel,
	}
	ctxTimeout, ctxTimeoutCancel := context.WithTimeout(ctx, 1*time.Second)
	defer ctxTimeoutCancel()
	txManager, err := NewTxManager(ctxTimeout, &cfg, ethClient, l2DB, &c,
		scConsts, initSCVars)
	if err != nil {
		return nil, tracerr.Wrap(err)
	}
	c.txManager = txManager
	// Set Eth LastBlockNum to -1 in stats so that stats.Synced() is
	// guaranteed to return false before it's updated with a real stats
	c.stats.Eth.LastBlock.Num = -1
	return &c, nil
}

// TxSelector returns the inner TxSelector
func (c *Coordinator) TxSelector() *txselector.TxSelector {
	return c.txSelector
}

// BatchBuilder returns the inner BatchBuilder
func (c *Coordinator) BatchBuilder() *batchbuilder.BatchBuilder {
	return c.batchBuilder
}

func (c *Coordinator) newPipeline(ctx context.Context) (*Pipeline, error) {
	c.pipelineNum++
	return NewPipeline(ctx, c.cfg, c.pipelineNum, c.historyDB, c.l2DB, c.txSelector,
		c.batchBuilder, &c.mutexL2DBUpdateDelete, c.purger, c, c.txManager,
		c.provers, &c.consts)
}

// MsgSyncBlock indicates an update to the Synchronizer stats
type MsgSyncBlock struct {
	Stats   synchronizer.Stats
	Batches []common.BatchData
	// Vars contains each Smart Contract variables if they are updated, or
	// nil if they haven't changed.
	Vars common.SCVariablesPtr
}

// MsgSyncReorg indicates a reorg
type MsgSyncReorg struct {
	Stats synchronizer.Stats
	Vars  common.SCVariablesPtr
}

// MsgStopPipeline indicates a signal to reset the pipeline
type MsgStopPipeline struct {
	Reason string
	// FailedBatchNum indicates the first batchNum that failed in the
	// pipeline.  If FailedBatchNum is 0, it should be ignored.
	FailedBatchNum common.BatchNum
}

// SendMsg is a thread safe method to pass a message to the Coordinator
func (c *Coordinator) SendMsg(ctx context.Context, msg interface{}) {
	select {
	case c.msgCh <- msg:
	case <-ctx.Done():
	}
}

func updateSCVars(vars *common.SCVariables, update common.SCVariablesPtr) {
	if update.Rollup != nil {
		vars.Rollup = *update.Rollup
	}
	if update.Auction != nil {
		vars.Auction = *update.Auction
	}
	if update.WDelayer != nil {
		vars.WDelayer = *update.WDelayer
	}
}

func (c *Coordinator) syncSCVars(vars common.SCVariablesPtr) {
	updateSCVars(&c.vars, vars)
}

func canForge(auctionConstants *common.AuctionConstants, auctionVars *common.AuctionVariables,
	currentSlot *common.Slot, nextSlot *common.Slot, addr ethCommon.Address, blockNum int64) bool {
	if blockNum < auctionConstants.GenesisBlockNum {
		log.Infow("canForge: requested blockNum is < genesis", "blockNum", blockNum,
			"genesis", auctionConstants.GenesisBlockNum)
		return false
	}
	var slot *common.Slot
	if currentSlot.StartBlock <= blockNum && blockNum <= currentSlot.EndBlock {
		slot = currentSlot
	} else if nextSlot.StartBlock <= blockNum && blockNum <= nextSlot.EndBlock {
		slot = nextSlot
	} else {
		log.Warnw("canForge: requested blockNum is outside current and next slot",
			"blockNum", blockNum, "currentSlot", currentSlot,
			"nextSlot", nextSlot,
		)
		return false
	}
	anyoneForge := false
	if !slot.ForgerCommitment &&
		auctionConstants.RelativeBlock(blockNum) >= int64(auctionVars.SlotDeadline) {
		log.Debugw("canForge: anyone can forge in the current slot (slotDeadline passed)",
			"block", blockNum)
		anyoneForge = true
	}
	if slot.Forger == addr || anyoneForge {
		return true
	}
	log.Debugw("canForge: can't forge", "slot.Forger", slot.Forger)
	return false
}

func (c *Coordinator) canForgeAt(blockNum int64) bool {
	return canForge(&c.consts.Auction, &c.vars.Auction,
		&c.stats.Sync.Auction.CurrentSlot, &c.stats.Sync.Auction.NextSlot,
		c.cfg.ForgerAddress, blockNum)
}

func (c *Coordinator) canForge() bool {
	blockNum := c.stats.Eth.LastBlock.Num + 1
	return canForge(&c.consts.Auction, &c.vars.Auction,
		&c.stats.Sync.Auction.CurrentSlot, &c.stats.Sync.Auction.NextSlot,
		c.cfg.ForgerAddress, blockNum)
}

func (c *Coordinator) syncStats(ctx context.Context, stats *synchronizer.Stats) error {
	nextBlock := c.stats.Eth.LastBlock.Num + 1
	canForge := c.canForgeAt(nextBlock)
	if c.cfg.ScheduleBatchBlocksAheadCheck != 0 && canForge {
		canForge = c.canForgeAt(nextBlock + c.cfg.ScheduleBatchBlocksAheadCheck)
	}
	if c.pipeline == nil {
		relativeBlock := c.consts.Auction.RelativeBlock(nextBlock)
		if canForge && relativeBlock < c.cfg.StartSlotBlocksDelay {
			log.Debugf("Coordinator: delaying pipeline start due to "+
				"relativeBlock (%v) < cfg.StartSlotBlocksDelay (%v)",
				relativeBlock, c.cfg.StartSlotBlocksDelay)
		} else if canForge {
			log.Infow("Coordinator: forging state begin", "block",
				stats.Eth.LastBlock.Num+1, "batch", stats.Sync.LastBatch.BatchNum)
			fromBatch := fromBatch{
				BatchNum:   stats.Sync.LastBatch.BatchNum,
				ForgerAddr: stats.Sync.LastBatch.ForgerAddr,
				StateRoot:  stats.Sync.LastBatch.StateRoot,
			}
			if c.lastNonFailedBatchNum > fromBatch.BatchNum {
				fromBatch.BatchNum = c.lastNonFailedBatchNum
				fromBatch.ForgerAddr = c.cfg.ForgerAddress
				fromBatch.StateRoot = big.NewInt(0)
			}
			// Before starting the pipeline make sure we reset any
			// l2tx from the pool that was forged in a batch that
			// didn't end up being mined.  We are already doing
			// this in handleStopPipeline, but we do it again as a
			// failsafe in case the last synced batchnum is
			// different than in the previous call to l2DB.Reorg,
			// or in case the node was restarted when there was a
			// started batch that included l2txs but was not mined.
			if err := c.l2DB.Reorg(fromBatch.BatchNum); err != nil {
				return tracerr.Wrap(err)
			}
			var err error
			if c.pipeline, err = c.newPipeline(ctx); err != nil {
				return tracerr.Wrap(err)
			}
			c.pipelineFromBatch = fromBatch
			// Start the pipeline
			if err := c.pipeline.Start(fromBatch.BatchNum, stats, &c.vars); err != nil {
				c.pipeline = nil
				return tracerr.Wrap(err)
			}
		}
	} else {
		if !canForge {
			log.Infow("Coordinator: forging state end", "block", stats.Eth.LastBlock.Num+1)
			c.pipeline.Stop(c.ctx)
			c.pipeline = nil
		}
	}
	if c.pipeline == nil {
		if _, err := c.purger.InvalidateMaybe(c.l2DB, c.txSelector.LocalAccountsDB(),
			stats.Sync.LastBlock.Num, int64(stats.Sync.LastBatch.BatchNum)); err != nil {
			return tracerr.Wrap(err)
		}
		if _, err := c.purger.PurgeMaybe(c.l2DB, stats.Sync.LastBlock.Num,
			int64(stats.Sync.LastBatch.BatchNum)); err != nil {
			return tracerr.Wrap(err)
		}
	}
	return nil
}

func (c *Coordinator) handleMsgSyncBlock(ctx context.Context, msg *MsgSyncBlock) error {
	c.stats = msg.Stats
	c.syncSCVars(msg.Vars)
	c.txManager.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)
	if c.pipeline != nil {
		c.pipeline.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)
	}
	if !c.stats.Synced() {
		return nil
	}
	return c.syncStats(ctx, &c.stats)
}

func (c *Coordinator) handleReorg(ctx context.Context, msg *MsgSyncReorg) error {
	c.stats = msg.Stats
	c.syncSCVars(msg.Vars)
	c.txManager.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)
	if c.pipeline != nil {
		c.pipeline.SetSyncStatsVars(ctx, &msg.Stats, &msg.Vars)
	}
	if c.stats.Sync.LastBatch.ForgerAddr != c.cfg.ForgerAddress &&
		(c.stats.Sync.LastBatch.StateRoot == nil || c.pipelineFromBatch.StateRoot == nil ||
			c.stats.Sync.LastBatch.StateRoot.Cmp(c.pipelineFromBatch.StateRoot) != 0) {
		// There's been a reorg and the batch state root from which the
		// pipeline was started has changed (probably because it was in
		// a block that was discarded), and it was sent by a different
		// coordinator than us.  That batch may never be in the main
		// chain, so we stop the pipeline  (it will be started again
		// once the node is in sync).
		log.Infow("Coordinator.handleReorg StopPipeline sync.LastBatch.ForgerAddr != cfg.ForgerAddr "+
			"& sync.LastBatch.StateRoot != pipelineFromBatch.StateRoot",
			"sync.LastBatch.StateRoot", c.stats.Sync.LastBatch.StateRoot,
			"pipelineFromBatch.StateRoot", c.pipelineFromBatch.StateRoot)
		c.txManager.DiscardPipeline(ctx, c.pipelineNum)
		if err := c.handleStopPipeline(ctx, "reorg", 0); err != nil {
			return tracerr.Wrap(err)
		}
	}
	return nil
}

// handleStopPipeline handles stopping the pipeline.  If failedBatchNum is 0,
// the next pipeline will start from the last state of the synchronizer,
// otherwise, it will state from failedBatchNum-1.
func (c *Coordinator) handleStopPipeline(ctx context.Context, reason string, failedBatchNum common.BatchNum) error {
	batchNum := c.stats.Sync.LastBatch.BatchNum
	if failedBatchNum != 0 {
		batchNum = failedBatchNum - 1
	}
	if c.pipeline != nil {
		c.pipeline.Stop(c.ctx)
		c.pipeline = nil
	}
	if err := c.l2DB.Reorg(batchNum); err != nil {
		return tracerr.Wrap(err)
	}
	c.lastNonFailedBatchNum = batchNum
	return nil
}

func (c *Coordinator) handleMsg(ctx context.Context, msg interface{}) error {
	switch msg := msg.(type) {
	case MsgSyncBlock:
		if err := c.handleMsgSyncBlock(ctx, &msg); err != nil {
			return tracerr.Wrap(fmt.Errorf("Coordinator.handleMsgSyncBlock error: %w", err))
		}
	case MsgSyncReorg:
		if err := c.handleReorg(ctx, &msg); err != nil {
			return tracerr.Wrap(fmt.Errorf("Coordinator.handleReorg error: %w", err))
		}
	case MsgStopPipeline:
		log.Infow("Coordinator received MsgStopPipeline", "reason", msg.Reason)
		if err := c.handleStopPipeline(ctx, msg.Reason, msg.FailedBatchNum); err != nil {
			return tracerr.Wrap(fmt.Errorf("Coordinator.handleStopPipeline: %w", err))
		}
	default:
		log.Fatalw("Coordinator Unexpected Coordinator msg of type %T: %+v", msg, msg)
	}
	return nil
}

// Start the coordinator
func (c *Coordinator) Start() {
	if c.started {
		log.Fatal("Coordinator already started")
	}
	c.started = true
	c.wg.Add(1)
	go func() {
		c.txManager.Run(c.ctx)
		c.wg.Done()
	}()

	c.wg.Add(1)
	go func() {
		timer := time.NewTimer(longWaitDuration)
		for {
			select {
			case <-c.ctx.Done():
				log.Info("Coordinator done")
				c.wg.Done()
				return
			case msg := <-c.msgCh:
				if err := c.handleMsg(c.ctx, msg); c.ctx.Err() != nil {
					continue
				} else if err != nil {
					log.Errorw("Coordinator.handleMsg", "err", err)
					if !timer.Stop() {
						<-timer.C
					}
					timer.Reset(c.cfg.SyncRetryInterval)
					continue
				}
			case <-timer.C:
				timer.Reset(longWaitDuration)
				if !c.stats.Synced() {
					continue
				}
				if err := c.syncStats(c.ctx, &c.stats); c.ctx.Err() != nil {
					continue
				} else if err != nil {
					log.Errorw("Coordinator.syncStats", "err", err)
					if !timer.Stop() {
						<-timer.C
					}
					timer.Reset(c.cfg.SyncRetryInterval)
					continue
				}
			}
		}
	}()

	c.wg.Add(1)
	go func() {
		for {
			select {
			case <-c.ctx.Done():
				log.Info("Coordinator L2DB.PurgeByExternalDelete loop done")
				c.wg.Done()
				return
			case <-time.After(c.cfg.PurgeByExtDelInterval):
				c.mutexL2DBUpdateDelete.Lock()
				if err := c.l2DB.PurgeByExternalDelete(); err != nil {
					log.Errorw("L2DB.PurgeByExternalDelete", "err", err)
				}
				c.mutexL2DBUpdateDelete.Unlock()
			}
		}
	}()
}

const stopCtxTimeout = 200 * time.Millisecond

// Stop the coordinator
func (c *Coordinator) Stop() {
	if !c.started {
		log.Fatal("Coordinator already stopped")
	}
	c.started = false
	log.Infow("Stopping Coordinator...")
	c.cancel()
	c.wg.Wait()
	if c.pipeline != nil {
		ctx, cancel := context.WithTimeout(context.Background(), stopCtxTimeout)
		defer cancel()
		c.pipeline.Stop(ctx)
		c.pipeline = nil
	}
}