Skip to content

Commit

Permalink
tests: preserve logs on LibGoalFixture failure (#6030)
Browse files Browse the repository at this point in the history
  • Loading branch information
algorandskiy authored Jun 13, 2024
1 parent 4e116cb commit 95c9c18
Show file tree
Hide file tree
Showing 15 changed files with 96 additions and 18 deletions.
3 changes: 3 additions & 0 deletions agreement/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ func (s *Service) Start() {
//
// This method returns after all resources have been cleaned up.
func (s *Service) Shutdown() {
s.log.Debug("agreement service is stopping")
defer s.log.Debug("agreement service has stopped")

close(s.quit)
s.quitFn()
<-s.done
Expand Down
3 changes: 3 additions & 0 deletions catchup/catchpointService.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ func (cs *CatchpointCatchupService) Abort() {
// Stop stops the catchpoint catchup service - unlike Abort, this is not intended to abort the process but rather to allow
// cleanup of in-memory resources for the purpose of clean shutdown.
func (cs *CatchpointCatchupService) Stop() {
cs.log.Debug("catchpoint service is stopping")
defer cs.log.Debug("catchpoint service has stopped")

// signal the running goroutine that we want to stop
cs.cancelCtxFunc()
// wait for the running goroutine to terminate.
Expand Down
3 changes: 3 additions & 0 deletions catchup/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@ func (s *Service) Start() {

// Stop informs the catchup service that it should stop, and waits for it to stop (when periodicSync() exits)
func (s *Service) Stop() {
s.log.Debug("catchup service is stopping")
defer s.log.Debug("catchup service has stopped")

s.cancel()
s.workers.Wait()
if s.initialSyncNotified.CompareAndSwap(0, 1) {
Expand Down
3 changes: 3 additions & 0 deletions data/txHandler.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,9 @@ func (handler *TxHandler) Start() {

// Stop suspends the processing of incoming messages at the transaction handler
func (handler *TxHandler) Stop() {
logging.Base().Debug("transaction handler is stopping")
defer logging.Base().Debug("transaction handler is stopping")

handler.ctxCancel()
if handler.erl != nil {
handler.erl.Stop()
Expand Down
15 changes: 10 additions & 5 deletions netdeploy/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,13 +415,14 @@ func (n Network) StartNode(binDir, nodeDir string, redirectOutput bool) (err err

// Stop the network, ensuring primary relay stops first
// No return code - we try to kill them if we can (if we read valid PID file)
func (n Network) Stop(binDir string) {
c := make(chan struct{}, len(n.cfg.RelayDirs)+len(n.nodeDirs))
func (n Network) Stop(binDir string) (err error) {
c := make(chan error, len(n.cfg.RelayDirs)+len(n.nodeDirs))
stopNodeContoller := func(nc *nodecontrol.NodeController) {
var stopErr error
defer func() {
c <- struct{}{}
c <- stopErr
}()
nc.FullStop()
stopErr = nc.FullStop()
}
for _, relayDir := range n.cfg.RelayDirs {
relayDataDir := n.getNodeFullPath(relayDir)
Expand All @@ -439,9 +440,13 @@ func (n Network) Stop(binDir string) {
}
// wait until we finish stopping all the node controllers.
for i := cap(c); i > 0; i-- {
<-c
stopErr := <-c
if stopErr != nil {
err = stopErr
}
}
close(c)
return err
}

// NetworkNodeStatus represents the result from checking the status of a particular node instance
Expand Down
3 changes: 3 additions & 0 deletions network/wsNetwork.go
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,9 @@ func (wn *WebsocketNetwork) innerStop() {
// Stop closes network connections and stops threads.
// Stop blocks until all activity on this node is done.
func (wn *WebsocketNetwork) Stop() {
wn.log.Debug("network is stopping")
defer wn.log.Debug("network has stopped")

wn.handler.ClearHandlers([]Tag{})

// if we have a working ticker, just stop it and clear it out. The access to this variable is safe since the Start()/Stop() are synced by the
Expand Down
2 changes: 2 additions & 0 deletions node/impls.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ func (i blockAuthenticatorImpl) Authenticate(block *bookkeeping.Block, cert *agr
}

func (i blockAuthenticatorImpl) Quit() {
logging.Base().Debug("block authenticator is stopping")
defer logging.Base().Debug("block authenticator has stopped")
i.AsyncVoteVerifier.Quit()
}

Expand Down
7 changes: 7 additions & 0 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,8 @@ func (node *AlgorandFullNode) startMonitoringRoutines() {
// waitMonitoringRoutines waits for all the monitoring routines to exit. Note that
// the node.mu must not be taken, and that the node's context should have been canceled.
func (node *AlgorandFullNode) waitMonitoringRoutines() {
node.log.Debug("waiting on node monitoring routines to exit")
defer node.log.Debug("done waiting on node monitoring routines to exit")
node.monitoringRoutinesWaitGroup.Wait()
}

Expand All @@ -409,6 +411,9 @@ func (node *AlgorandFullNode) ListeningAddress() (string, bool) {

// Stop stops running the node. Once a node is closed, it can never start again.
func (node *AlgorandFullNode) Stop() {
node.log.Debug("algorand node is stopping")
defer node.log.Debug("algorand node has stopped")

node.mu.Lock()
defer func() {
node.mu.Unlock()
Expand All @@ -431,9 +436,11 @@ func (node *AlgorandFullNode) Stop() {
node.ledgerService.Stop()
}
node.catchupBlockAuth.Quit()
node.log.Debug("crypto worker pools are stopping")
node.highPriorityCryptoVerificationPool.Shutdown()
node.lowPriorityCryptoVerificationPool.Shutdown()
node.cryptoPool.Shutdown()
node.log.Debug("crypto worker pools have stopped")
node.cancelCtx()
}

Expand Down
9 changes: 9 additions & 0 deletions nodecontrol/algodControl.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ func (e *NodeNotRunningError) Error() string {
return fmt.Sprintf("no running node in directory '%s'", e.algodDataDir)
}

// NodeKilledError thrown when StopAlgod is called but the node was killed by SIGKILL instead of a clean shutdown with SIGTERM
type NodeKilledError struct {
}

func (e *NodeKilledError) Error() string {
return "node was killed"
}

// MissingDataDirError thrown when StopAlgod is called but requested directory does not exist
type MissingDataDirError struct {
algodDataDir string
Expand Down Expand Up @@ -176,6 +184,7 @@ func (nc *NodeController) StopAlgod() (err error) {
if killed {
// delete the pid file.
os.Remove(nc.algodPidFile)
return &NodeKilledError{}
}
} else {
return &NodeNotRunningError{algodDataDir: nc.algodDataDir}
Expand Down
3 changes: 3 additions & 0 deletions rpcs/blockService.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,9 @@ func (bs *BlockService) Start() {

// Stop servicing catchup requests over ws
func (bs *BlockService) Stop() {
bs.log.Debug("block service is stopping")
defer bs.log.Debug("block service has stopped")

bs.mu.Lock()
close(bs.stop)
bs.mu.Unlock()
Expand Down
3 changes: 3 additions & 0 deletions rpcs/ledgerService.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ func (ls *LedgerService) Start() {
// Stop servicing catchup requests
func (ls *LedgerService) Stop() {
if ls.enableService {
logging.Base().Debug("ledger service is stopping")
defer logging.Base().Debug("ledger service has stopped")

ls.running.Store(0)
ls.stopping.Wait()
}
Expand Down
3 changes: 3 additions & 0 deletions rpcs/txSyncer.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ func (syncer *TxSyncer) Start(canStart chan struct{}) {

// Stop stops periodic syncing
func (syncer *TxSyncer) Stop() {
syncer.log.Debug("transaction syncer is stopping")
defer syncer.log.Debug("transaction syncer has stopped")

syncer.cancel()
syncer.wg.Wait()
}
Expand Down
3 changes: 3 additions & 0 deletions stateproof/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ func (spw *Worker) initDb(inMemory bool) error {
// Stop stops any goroutines associated with this worker. It is the caller responsibility to remove the register
// network handlers
func (spw *Worker) Stop() {
spw.log.Debug("stateproof worker is stopping")
defer spw.log.Debug("stateproof worker has stopped")

spw.shutdown()
spw.wg.Wait()

Expand Down
17 changes: 17 additions & 0 deletions test/e2e-go/features/catchup/basicCatchup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,15 @@ func runCatchupOverGossip(t fixtures.TestingTB,
a.NoError(err)
a.Empty(cfg.NetworkProtocolVersion)
cfg.NetworkProtocolVersion = ledgerNodeDowngradeTo
cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
cfg.SaveToDisk(dir)
} else {
// TODO: remove when TestCatchupOverGossip is fixed
dir, err := fixture.GetNodeDir("Node")
a.NoError(err)
cfg, err := config.LoadConfigFromDisk(dir)
a.NoError(err)
cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
cfg.SaveToDisk(dir)
}

Expand All @@ -138,6 +147,14 @@ func runCatchupOverGossip(t fixtures.TestingTB,
a.NoError(err)
a.Empty(cfg.NetworkProtocolVersion)
cfg.NetworkProtocolVersion = fetcherNodeDowngradeTo
cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
cfg.SaveToDisk(dir)
} else {
// TODO: remove when TestCatchupOverGossip is fixed
dir := fixture.PrimaryDataDir()
cfg, err := config.LoadConfigFromDisk(dir)
a.NoError(err)
cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
cfg.SaveToDisk(dir)
}

Expand Down
37 changes: 24 additions & 13 deletions test/framework/fixtures/libgoalFixture.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ func (f *LibGoalFixture) nodeExitWithError(nc *nodecontrol.NodeController, err e
}

debugLog := func() {
fmt.Fprintf(os.Stderr, "Node at %s has terminated with an error: %v. Dumping logs...", nc.GetDataDir(), err)
fmt.Fprintf(os.Stderr, "Node at %s has terminated with an error: %v. Dumping logs...\n", nc.GetDataDir(), err)
f.dumpLogs(filepath.Join(nc.GetDataDir(), "node.log"))
}

Expand All @@ -158,7 +158,7 @@ func (f *LibGoalFixture) nodeExitWithError(nc *nodecontrol.NodeController, err e
ws := exitError.Sys().(syscall.WaitStatus)
exitCode := ws.ExitStatus()

fmt.Fprintf(os.Stderr, "Node at %s has terminated with error code %d (%v)", nc.GetDataDir(), exitCode, *exitError)
fmt.Fprintf(os.Stderr, "Node at %s has terminated with error code %d (%v)\n", nc.GetDataDir(), exitCode, *exitError)
debugLog()
require.NoError(f.t, err)
}
Expand Down Expand Up @@ -345,22 +345,33 @@ func (f *LibGoalFixture) Shutdown() {
func (f *LibGoalFixture) ShutdownImpl(preserveData bool) {
f.NC.StopKMD()
if preserveData {
f.network.Stop(f.binDir)
err := f.network.Stop(f.binDir)
if err != nil {
f.t.Logf("Fixture %s shutdown caught a network stop error: %v", f.Name, err)
}
for _, relayDir := range f.RelayDataDirs() {
f.dumpLogs(filepath.Join(relayDir, "node.log"))
}
for _, nodeDir := range f.NodeDataDirs() {
f.dumpLogs(filepath.Join(nodeDir, "node.log"))
}
} else {
f.network.Delete(f.binDir)

// Remove the test dir, if it was created by us as a temporary
// directory and it is empty. If there's anything still in the
// test dir, os.Remove()'s rmdir will fail and have no effect;
// we ignore this error.
if f.testDirTmp {
os.Remove(f.testDir)
err := f.network.Stop(f.binDir)
if err == nil {
// no error, proceed with cleanup
delErr := f.network.Delete(f.binDir)
if delErr != nil {
f.t.Logf("Fixture %s shutdown caught a network delete error: %v", f.Name, delErr)
}
// Remove the test dir, if it was created by us as a temporary
// directory and it is empty. If there's anything still in the
// test dir, os.Remove()'s rmdir will fail and have no effect;
// we ignore this error.
if f.testDirTmp {
os.Remove(f.testDir)
}
} else {
f.t.Logf("Fixture %s shutdown caught a network stop error: %v", f.Name, err)
}
}
}
Expand All @@ -369,14 +380,14 @@ func (f *LibGoalFixture) ShutdownImpl(preserveData bool) {
func (f *LibGoalFixture) dumpLogs(filePath string) {
file, err := os.Open(filePath)
if err != nil {
fmt.Fprintf(os.Stderr, "could not open %s", filePath)
fmt.Fprintf(os.Stderr, "could not open %s\n", filePath)
return
}
defer file.Close()

fmt.Fprintf(os.Stderr, "=================================\n")
parts := strings.Split(filePath, "/")
fmt.Fprintf(os.Stderr, "%s/%s:", parts[len(parts)-2], parts[len(parts)-1]) // Primary/node.log
fmt.Fprintf(os.Stderr, "%s/%s:\n", parts[len(parts)-2], parts[len(parts)-1]) // Primary/node.log
scanner := bufio.NewScanner(file)
for scanner.Scan() {
fmt.Fprint(os.Stderr, scanner.Text())
Expand Down

0 comments on commit 95c9c18

Please sign in to comment.