You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

600 lines
19 KiB

package coordinator
import (
"context"
"database/sql"
"fmt"
"math/big"
"sync"
"time"
"github.com/hermeznetwork/hermez-node/batchbuilder"
"github.com/hermeznetwork/hermez-node/common"
"github.com/hermeznetwork/hermez-node/db/historydb"
"github.com/hermeznetwork/hermez-node/db/l2db"
"github.com/hermeznetwork/hermez-node/eth"
"github.com/hermeznetwork/hermez-node/log"
"github.com/hermeznetwork/hermez-node/prover"
"github.com/hermeznetwork/hermez-node/synchronizer"
"github.com/hermeznetwork/hermez-node/txselector"
"github.com/hermeznetwork/tracerr"
)
type statsVars struct {
Stats synchronizer.Stats
Vars synchronizer.SCVariablesPtr
}
type state struct {
batchNum common.BatchNum
lastScheduledL1BatchBlockNum int64
lastForgeL1TxsNum int64
lastSlotForged int64
}
// Pipeline manages the forging of batches with parallel server proofs
type Pipeline struct {
num int
cfg Config
consts synchronizer.SCConsts
// state
state state
started bool
rw sync.RWMutex
errAtBatchNum common.BatchNum
lastForgeTime time.Time
proversPool *ProversPool
provers []prover.Client
coord *Coordinator
txManager *TxManager
historyDB *historydb.HistoryDB
l2DB *l2db.L2DB
txSelector *txselector.TxSelector
batchBuilder *batchbuilder.BatchBuilder
mutexL2DBUpdateDelete *sync.Mutex
purger *Purger
stats synchronizer.Stats
vars synchronizer.SCVariables
statsVarsCh chan statsVars
ctx context.Context
wg sync.WaitGroup
cancel context.CancelFunc
}
func (p *Pipeline) setErrAtBatchNum(batchNum common.BatchNum) {
p.rw.Lock()
defer p.rw.Unlock()
p.errAtBatchNum = batchNum
}
func (p *Pipeline) getErrAtBatchNum() common.BatchNum {
p.rw.RLock()
defer p.rw.RUnlock()
return p.errAtBatchNum
}
// NewPipeline creates a new Pipeline
func NewPipeline(ctx context.Context,
cfg Config,
num int, // Pipeline sequential number
historyDB *historydb.HistoryDB,
l2DB *l2db.L2DB,
txSelector *txselector.TxSelector,
batchBuilder *batchbuilder.BatchBuilder,
mutexL2DBUpdateDelete *sync.Mutex,
purger *Purger,
coord *Coordinator,
txManager *TxManager,
provers []prover.Client,
scConsts *synchronizer.SCConsts,
) (*Pipeline, error) {
proversPool := NewProversPool(len(provers))
proversPoolSize := 0
for _, prover := range provers {
if err := prover.WaitReady(ctx); err != nil {
log.Errorw("prover.WaitReady", "err", err)
} else {
proversPool.Add(ctx, prover)
proversPoolSize++
}
}
if proversPoolSize == 0 {
return nil, tracerr.Wrap(fmt.Errorf("no provers in the pool"))
}
return &Pipeline{
num: num,
cfg: cfg,
historyDB: historyDB,
l2DB: l2DB,
txSelector: txSelector,
batchBuilder: batchBuilder,
provers: provers,
proversPool: proversPool,
mutexL2DBUpdateDelete: mutexL2DBUpdateDelete,
purger: purger,
coord: coord,
txManager: txManager,
consts: *scConsts,
statsVarsCh: make(chan statsVars, queueLen),
}, nil
}
// SetSyncStatsVars is a thread safe method to sets the synchronizer Stats
func (p *Pipeline) SetSyncStatsVars(ctx context.Context, stats *synchronizer.Stats,
vars *synchronizer.SCVariablesPtr) {
select {
case p.statsVarsCh <- statsVars{Stats: *stats, Vars: *vars}:
case <-ctx.Done():
}
}
// reset pipeline state
func (p *Pipeline) reset(batchNum common.BatchNum,
stats *synchronizer.Stats, vars *synchronizer.SCVariables) error {
p.state = state{
batchNum: batchNum,
lastForgeL1TxsNum: stats.Sync.LastForgeL1TxsNum,
lastScheduledL1BatchBlockNum: 0,
lastSlotForged: -1,
}
p.stats = *stats
p.vars = *vars
// Reset the StateDB in TxSelector and BatchBuilder from the
// synchronizer only if the checkpoint we reset from either:
// a. Doesn't exist in the TxSelector/BatchBuilder
// b. The batch has already been synced by the synchronizer and has a
// different MTRoot than the BatchBuilder
// Otherwise, reset from the local checkpoint.
// First attempt to reset from local checkpoint if such checkpoint exists
existsTxSelector, err := p.txSelector.LocalAccountsDB().CheckpointExists(p.state.batchNum)
if err != nil {
return tracerr.Wrap(err)
}
fromSynchronizerTxSelector := !existsTxSelector
if err := p.txSelector.Reset(p.state.batchNum, fromSynchronizerTxSelector); err != nil {
return tracerr.Wrap(err)
}
existsBatchBuilder, err := p.batchBuilder.LocalStateDB().CheckpointExists(p.state.batchNum)
if err != nil {
return tracerr.Wrap(err)
}
fromSynchronizerBatchBuilder := !existsBatchBuilder
if err := p.batchBuilder.Reset(p.state.batchNum, fromSynchronizerBatchBuilder); err != nil {
return tracerr.Wrap(err)
}
// After reset, check that if the batch exists in the historyDB, the
// stateRoot matches with the local one, if not, force a reset from
// synchronizer
batch, err := p.historyDB.GetBatch(p.state.batchNum)
if tracerr.Unwrap(err) == sql.ErrNoRows {
// nothing to do
} else if err != nil {
return tracerr.Wrap(err)
} else {
localStateRoot := p.batchBuilder.LocalStateDB().MT.Root().BigInt()
if batch.StateRoot.Cmp(localStateRoot) != 0 {
log.Debugw("localStateRoot (%v) != historyDB stateRoot (%v). "+
"Forcing reset from Synchronizer", localStateRoot, batch.StateRoot)
// StateRoot from synchronizer doesn't match StateRoot
// from batchBuilder, force a reset from synchronizer
if err := p.txSelector.Reset(p.state.batchNum, true); err != nil {
return tracerr.Wrap(err)
}
if err := p.batchBuilder.Reset(p.state.batchNum, true); err != nil {
return tracerr.Wrap(err)
}
}
}
return nil
}
func (p *Pipeline) syncSCVars(vars synchronizer.SCVariablesPtr) {
updateSCVars(&p.vars, vars)
}
// handleForgeBatch waits for an available proof server, calls p.forgeBatch to
// forge the batch and get the zkInputs, and then sends the zkInputs to the
// selected proof server so that the proof computation begins.
func (p *Pipeline) handleForgeBatch(ctx context.Context,
batchNum common.BatchNum) (batchInfo *BatchInfo, err error) {
// 1. Wait for an available serverProof (blocking call)
serverProof, err := p.proversPool.Get(ctx)
if ctx.Err() != nil {
return nil, ctx.Err()
} else if err != nil {
log.Errorw("proversPool.Get", "err", err)
return nil, tracerr.Wrap(err)
}
defer func() {
// If we encounter any error (notice that this function returns
// errors to notify that a batch is not forged not only because
// of unexpected errors but also due to benign causes), add the
// serverProof back to the pool
if err != nil {
p.proversPool.Add(ctx, serverProof)
}
}()
// 2. Forge the batch internally (make a selection of txs and prepare
// all the smart contract arguments)
p.mutexL2DBUpdateDelete.Lock()
batchInfo, err = p.forgeBatch(batchNum)
p.mutexL2DBUpdateDelete.Unlock()
if ctx.Err() != nil {
return nil, ctx.Err()
} else if err != nil {
if tracerr.Unwrap(err) == errLastL1BatchNotSynced {
log.Warnw("forgeBatch: scheduled L1Batch too early", "err", err,
"lastForgeL1TxsNum", p.state.lastForgeL1TxsNum,
"syncLastForgeL1TxsNum", p.stats.Sync.LastForgeL1TxsNum)
} else if tracerr.Unwrap(err) == errForgeNoTxsBeforeDelay ||
tracerr.Unwrap(err) == errForgeBeforeDelay {
// no log
} else {
log.Errorw("forgeBatch", "err", err)
}
return nil, tracerr.Wrap(err)
}
// 3. Send the ZKInputs to the proof server
batchInfo.ServerProof = serverProof
if err := p.sendServerProof(ctx, batchInfo); ctx.Err() != nil {
return nil, ctx.Err()
} else if err != nil {
log.Errorw("sendServerProof", "err", err)
return nil, tracerr.Wrap(err)
}
return batchInfo, nil
}
// Start the forging pipeline
func (p *Pipeline) Start(batchNum common.BatchNum,
stats *synchronizer.Stats, vars *synchronizer.SCVariables) error {
if p.started {
log.Fatal("Pipeline already started")
}
p.started = true
if err := p.reset(batchNum, stats, vars); err != nil {
return tracerr.Wrap(err)
}
p.ctx, p.cancel = context.WithCancel(context.Background())
queueSize := 1
batchChSentServerProof := make(chan *BatchInfo, queueSize)
p.wg.Add(1)
go func() {
timer := time.NewTimer(zeroDuration)
for {
select {
case <-p.ctx.Done():
log.Info("Pipeline forgeBatch loop done")
p.wg.Done()
return
case statsVars := <-p.statsVarsCh:
p.stats = statsVars.Stats
p.syncSCVars(statsVars.Vars)
case <-timer.C:
timer.Reset(p.cfg.ForgeRetryInterval)
// Once errAtBatchNum != 0, we stop forging
// batches because there's been an error and we
// wait for the pipeline to be stopped.
if p.getErrAtBatchNum() != 0 {
continue
}
batchNum = p.state.batchNum + 1
batchInfo, err := p.handleForgeBatch(p.ctx, batchNum)
if p.ctx.Err() != nil {
continue
} else if tracerr.Unwrap(err) == errLastL1BatchNotSynced ||
tracerr.Unwrap(err) == errForgeNoTxsBeforeDelay ||
tracerr.Unwrap(err) == errForgeBeforeDelay {
continue
} else if err != nil {
p.setErrAtBatchNum(batchNum)
p.coord.SendMsg(p.ctx, MsgStopPipeline{
Reason: fmt.Sprintf(
"Pipeline.handleForgBatch: %v", err),
FailedBatchNum: batchNum,
})
continue
}
p.lastForgeTime = time.Now()
p.state.batchNum = batchNum
select {
case batchChSentServerProof <- batchInfo:
case <-p.ctx.Done():
}
if !timer.Stop() {
<-timer.C
}
timer.Reset(zeroDuration)
}
}
}()
p.wg.Add(1)
go func() {
for {
select {
case <-p.ctx.Done():
log.Info("Pipeline waitServerProofSendEth loop done")
p.wg.Done()
return
case batchInfo := <-batchChSentServerProof:
// Once errAtBatchNum != 0, we stop forging
// batches because there's been an error and we
// wait for the pipeline to be stopped.
if p.getErrAtBatchNum() != 0 {
continue
}
err := p.waitServerProof(p.ctx, batchInfo)
if p.ctx.Err() != nil {
continue
} else if err != nil {
log.Errorw("waitServerProof", "err", err)
p.setErrAtBatchNum(batchInfo.BatchNum)
p.coord.SendMsg(p.ctx, MsgStopPipeline{
Reason: fmt.Sprintf(
"Pipeline.waitServerProof: %v", err),
FailedBatchNum: batchInfo.BatchNum,
})
continue
}
// We are done with this serverProof, add it back to the pool
p.proversPool.Add(p.ctx, batchInfo.ServerProof)
p.txManager.AddBatch(p.ctx, batchInfo)
}
}
}()
return nil
}
// Stop the forging pipeline
func (p *Pipeline) Stop(ctx context.Context) {
if !p.started {
log.Fatal("Pipeline already stopped")
}
p.started = false
log.Info("Stopping Pipeline...")
p.cancel()
p.wg.Wait()
for _, prover := range p.provers {
if err := prover.Cancel(ctx); ctx.Err() != nil {
continue
} else if err != nil {
log.Errorw("prover.Cancel", "err", err)
}
}
}
// sendServerProof sends the circuit inputs to the proof server
func (p *Pipeline) sendServerProof(ctx context.Context, batchInfo *BatchInfo) error {
p.cfg.debugBatchStore(batchInfo)
// 7. Call the selected idle server proof with BatchBuilder output,
// save server proof info for batchNum
if err := batchInfo.ServerProof.CalculateProof(ctx, batchInfo.ZKInputs); err != nil {
return tracerr.Wrap(err)
}
return nil
}
// forgeBatch forges the batchNum batch.
func (p *Pipeline) forgeBatch(batchNum common.BatchNum) (batchInfo *BatchInfo, err error) {
// remove transactions from the pool that have been there for too long
_, err = p.purger.InvalidateMaybe(p.l2DB, p.txSelector.LocalAccountsDB(),
p.stats.Sync.LastBlock.Num, int64(batchNum))
if err != nil {
return nil, tracerr.Wrap(err)
}
_, err = p.purger.PurgeMaybe(p.l2DB, p.stats.Sync.LastBlock.Num, int64(batchNum))
if err != nil {
return nil, tracerr.Wrap(err)
}
// Structure to accumulate data and metadata of the batch
now := time.Now()
batchInfo = &BatchInfo{PipelineNum: p.num, BatchNum: batchNum}
batchInfo.Debug.StartTimestamp = now
batchInfo.Debug.StartBlockNum = p.stats.Eth.LastBlock.Num + 1
var poolL2Txs []common.PoolL2Tx
var discardedL2Txs []common.PoolL2Tx
var l1UserTxsExtra, l1CoordTxs []common.L1Tx
var auths [][]byte
var coordIdxs []common.Idx
// Check if the slot is not yet fulfilled
slotCommitted := false
if p.stats.Sync.Auction.CurrentSlot.ForgerCommitment ||
p.stats.Sync.Auction.CurrentSlot.SlotNum == p.state.lastSlotForged {
slotCommitted = true
}
// If we haven't reached the ForgeDelay, skip forging the batch
if slotCommitted && now.Sub(p.lastForgeTime) < p.cfg.ForgeDelay {
return nil, tracerr.Wrap(errForgeBeforeDelay)
}
// 1. Decide if we forge L2Tx or L1+L2Tx
if p.shouldL1L2Batch(batchInfo) {
batchInfo.L1Batch = true
if p.state.lastForgeL1TxsNum != p.stats.Sync.LastForgeL1TxsNum {
return nil, tracerr.Wrap(errLastL1BatchNotSynced)
}
// 2a: L1+L2 txs
l1UserTxs, err := p.historyDB.GetUnforgedL1UserTxs(p.state.lastForgeL1TxsNum + 1)
if err != nil {
return nil, tracerr.Wrap(err)
}
coordIdxs, auths, l1UserTxsExtra, l1CoordTxs, poolL2Txs, discardedL2Txs, err =
p.txSelector.GetL1L2TxSelection(p.cfg.TxProcessorConfig, l1UserTxs)
if err != nil {
return nil, tracerr.Wrap(err)
}
} else {
// 2b: only L2 txs
coordIdxs, auths, l1CoordTxs, poolL2Txs, discardedL2Txs, err =
p.txSelector.GetL2TxSelection(p.cfg.TxProcessorConfig)
if err != nil {
return nil, tracerr.Wrap(err)
}
l1UserTxsExtra = nil
}
// If there are no txs to forge, no l1UserTxs in the open queue to
// freeze, and we haven't reached the ForgeNoTxsDelay, skip forging the
// batch.
if slotCommitted && now.Sub(p.lastForgeTime) < p.cfg.ForgeNoTxsDelay {
noTxs := false
if len(l1UserTxsExtra) == 0 && len(l1CoordTxs) == 0 && len(poolL2Txs) == 0 {
if batchInfo.L1Batch {
// Query the number of unforged L1UserTxs
// (either in a open queue or in a frozen
// not-yet-forged queue).
count, err := p.historyDB.GetUnforgedL1UserTxsCount()
if err != nil {
return nil, tracerr.Wrap(err)
}
// If there are future L1UserTxs, we forge a
// batch to advance the queues to be able to
// forge the L1UserTxs in the future.
// Otherwise, skip.
if count == 0 {
noTxs = true
}
} else {
noTxs = true
}
}
if noTxs {
if err := p.txSelector.Reset(batchInfo.BatchNum-1, false); err != nil {
return nil, tracerr.Wrap(err)
}
return nil, tracerr.Wrap(errForgeNoTxsBeforeDelay)
}
}
if batchInfo.L1Batch {
p.state.lastScheduledL1BatchBlockNum = p.stats.Eth.LastBlock.Num + 1
p.state.lastForgeL1TxsNum++
}
// 3. Save metadata from TxSelector output for BatchNum
batchInfo.L1UserTxsExtra = l1UserTxsExtra
batchInfo.L1CoordTxs = l1CoordTxs
batchInfo.L1CoordinatorTxsAuths = auths
batchInfo.CoordIdxs = coordIdxs
batchInfo.VerifierIdx = p.cfg.VerifierIdx
if err := p.l2DB.StartForging(common.TxIDsFromPoolL2Txs(poolL2Txs),
batchInfo.BatchNum); err != nil {
return nil, tracerr.Wrap(err)
}
if err := p.l2DB.UpdateTxsInfo(discardedL2Txs); err != nil {
return nil, tracerr.Wrap(err)
}
// Invalidate transactions that become invalid because of
// the poolL2Txs selected. Will mark as invalid the txs that have a
// (fromIdx, nonce) which already appears in the selected txs (includes
// all the nonces smaller than the current one)
err = p.l2DB.InvalidateOldNonces(idxsNonceFromPoolL2Txs(poolL2Txs), batchInfo.BatchNum)
if err != nil {
return nil, tracerr.Wrap(err)
}
// 4. Call BatchBuilder with TxSelector output
configBatch := &batchbuilder.ConfigBatch{
TxProcessorConfig: p.cfg.TxProcessorConfig,
}
zkInputs, err := p.batchBuilder.BuildBatch(coordIdxs, configBatch, l1UserTxsExtra,
l1CoordTxs, poolL2Txs)
if err != nil {
return nil, tracerr.Wrap(err)
}
l2Txs, err := common.PoolL2TxsToL2Txs(poolL2Txs) // NOTE: This is a big uggly, find a better way
if err != nil {
return nil, tracerr.Wrap(err)
}
batchInfo.L2Txs = l2Txs
// 5. Save metadata from BatchBuilder output for BatchNum
batchInfo.ZKInputs = zkInputs
batchInfo.Debug.Status = StatusForged
p.cfg.debugBatchStore(batchInfo)
log.Infow("Pipeline: batch forged internally", "batch", batchInfo.BatchNum)
p.state.lastSlotForged = p.stats.Sync.Auction.CurrentSlot.SlotNum
return batchInfo, nil
}
// waitServerProof gets the generated zkProof & sends it to the SmartContract
func (p *Pipeline) waitServerProof(ctx context.Context, batchInfo *BatchInfo) error {
proof, pubInputs, err := batchInfo.ServerProof.GetProof(ctx) // blocking call,
// until not resolved don't continue. Returns when the proof server has calculated the proof
if err != nil {
return tracerr.Wrap(err)
}
batchInfo.Proof = proof
batchInfo.PublicInputs = pubInputs
batchInfo.ForgeBatchArgs = prepareForgeBatchArgs(batchInfo)
batchInfo.Debug.Status = StatusProof
p.cfg.debugBatchStore(batchInfo)
log.Infow("Pipeline: batch proof calculated", "batch", batchInfo.BatchNum)
return nil
}
func (p *Pipeline) shouldL1L2Batch(batchInfo *BatchInfo) bool {
// Take the lastL1BatchBlockNum as the biggest between the last
// scheduled one, and the synchronized one.
lastL1BatchBlockNum := p.state.lastScheduledL1BatchBlockNum
if p.stats.Sync.LastL1BatchBlock > lastL1BatchBlockNum {
lastL1BatchBlockNum = p.stats.Sync.LastL1BatchBlock
}
// Set Debug information
batchInfo.Debug.LastScheduledL1BatchBlockNum = p.state.lastScheduledL1BatchBlockNum
batchInfo.Debug.LastL1BatchBlock = p.stats.Sync.LastL1BatchBlock
batchInfo.Debug.LastL1BatchBlockDelta = p.stats.Eth.LastBlock.Num + 1 - lastL1BatchBlockNum
batchInfo.Debug.L1BatchBlockScheduleDeadline =
int64(float64(p.vars.Rollup.ForgeL1L2BatchTimeout-1) * p.cfg.L1BatchTimeoutPerc)
// Return true if we have passed the l1BatchTimeoutPerc portion of the
// range before the l1batch timeout.
return p.stats.Eth.LastBlock.Num+1-lastL1BatchBlockNum >=
int64(float64(p.vars.Rollup.ForgeL1L2BatchTimeout-1)*p.cfg.L1BatchTimeoutPerc)
}
func prepareForgeBatchArgs(batchInfo *BatchInfo) *eth.RollupForgeBatchArgs {
proof := batchInfo.Proof
zki := batchInfo.ZKInputs
return &eth.RollupForgeBatchArgs{
NewLastIdx: int64(zki.Metadata.NewLastIdxRaw),
NewStRoot: zki.Metadata.NewStateRootRaw.BigInt(),
NewExitRoot: zki.Metadata.NewExitRootRaw.BigInt(),
L1UserTxs: batchInfo.L1UserTxsExtra,
L1CoordinatorTxs: batchInfo.L1CoordTxs,
L1CoordinatorTxsAuths: batchInfo.L1CoordinatorTxsAuths,
L2TxsData: batchInfo.L2Txs,
FeeIdxCoordinator: batchInfo.CoordIdxs,
// Circuit selector
VerifierIdx: batchInfo.VerifierIdx,
L1Batch: batchInfo.L1Batch,
ProofA: [2]*big.Int{proof.PiA[0], proof.PiA[1]},
// Implementation of the verifier need a swap on the proofB vector
ProofB: [2][2]*big.Int{
{proof.PiB[0][1], proof.PiB[0][0]},
{proof.PiB[1][1], proof.PiB[1][0]},
},
ProofC: [2]*big.Int{proof.PiC[0], proof.PiC[1]},
}
}