From 95c9c1837cf466c2c9f423864c9901dc26e0e7c1 Mon Sep 17 00:00:00 2001
From: Pavel Zbitskiy <65323360+algorandskiy@users.noreply.github.com>
Date: Thu, 13 Jun 2024 15:52:13 -0400
Subject: [PATCH] tests: preserve logs on LibGoalFixture failure (#6030)

---
 agreement/service.go                          |  3 ++
 catchup/catchpointService.go                  |  3 ++
 catchup/service.go                            |  3 ++
 data/txHandler.go                             |  3 ++
 netdeploy/network.go                          | 15 +++++---
 network/wsNetwork.go                          |  3 ++
 node/impls.go                                 |  2 +
 node/node.go                                  |  7 ++++
 nodecontrol/algodControl.go                   |  9 +++++
 rpcs/blockService.go                          |  3 ++
 rpcs/ledgerService.go                         |  3 ++
 rpcs/txSyncer.go                              |  3 ++
 stateproof/worker.go                          |  3 ++
 .../features/catchup/basicCatchup_test.go     | 17 +++++++++
 test/framework/fixtures/libgoalFixture.go     | 37 ++++++++++++-------
 15 files changed, 96 insertions(+), 18 deletions(-)

diff --git a/agreement/service.go b/agreement/service.go
index d8ec84b92f..8e38797492 100644
--- a/agreement/service.go
+++ b/agreement/service.go
@@ -173,6 +173,9 @@ func (s *Service) Start() {
 //
 // This method returns after all resources have been cleaned up.
 func (s *Service) Shutdown() {
+	s.log.Debug("agreement service is stopping")
+	defer s.log.Debug("agreement service has stopped")
+
 	close(s.quit)
 	s.quitFn()
 	<-s.done
diff --git a/catchup/catchpointService.go b/catchup/catchpointService.go
index 974f5964f1..efa34436a3 100644
--- a/catchup/catchpointService.go
+++ b/catchup/catchpointService.go
@@ -185,6 +185,9 @@ func (cs *CatchpointCatchupService) Abort() {
 // Stop stops the catchpoint catchup service - unlike Abort, this is not intended to abort the process but rather to allow
 // cleanup of in-memory resources for the purpose of clean shutdown.
 func (cs *CatchpointCatchupService) Stop() {
+	cs.log.Debug("catchpoint service is stopping")
+	defer cs.log.Debug("catchpoint service has stopped")
+
 	// signal the running goroutine that we want to stop
 	cs.cancelCtxFunc()
 	// wait for the running goroutine to terminate.
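The additions above, and the ones in the hunks that follow, all share one shape: log on entry to Stop/Shutdown, then log again on exit through a defer, so a shutdown that hangs shows up in the log as a "stopping" line with no matching "stopped" line. A minimal, self-contained sketch of that pattern; the Service type and the log calls here are illustrative stand-ins, not code from this patch.

package main

import "log"

// Service is a stand-in type used only for this illustration.
type Service struct {
	quit chan struct{}
}

// Stop logs on entry and, via defer, again on exit. If Stop blocks forever,
// the log ends with "service is stopping" and never prints the second line,
// which is exactly the breadcrumb the debug lines in this patch provide.
func (s *Service) Stop() {
	log.Print("service is stopping")
	defer log.Print("service has stopped")

	close(s.quit)
	// ... wait for goroutines to drain ...
}

func main() {
	s := &Service{quit: make(chan struct{})}
	s.Stop()
}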
diff --git a/catchup/service.go b/catchup/service.go
index 5c6609b236..b1720c4fce 100644
--- a/catchup/service.go
+++ b/catchup/service.go
@@ -161,6 +161,9 @@ func (s *Service) Start() {
 
 // Stop informs the catchup service that it should stop, and waits for it to stop (when periodicSync() exits)
 func (s *Service) Stop() {
+	s.log.Debug("catchup service is stopping")
+	defer s.log.Debug("catchup service has stopped")
+
 	s.cancel()
 	s.workers.Wait()
 	if s.initialSyncNotified.CompareAndSwap(0, 1) {
diff --git a/data/txHandler.go b/data/txHandler.go
index 7851889378..871e71cbc3 100644
--- a/data/txHandler.go
+++ b/data/txHandler.go
@@ -254,6 +254,9 @@ func (handler *TxHandler) Start() {
 
 // Stop suspends the processing of incoming messages at the transaction handler
 func (handler *TxHandler) Stop() {
+	logging.Base().Debug("transaction handler is stopping")
+	defer logging.Base().Debug("transaction handler has stopped")
+
 	handler.ctxCancel()
 	if handler.erl != nil {
 		handler.erl.Stop()
diff --git a/netdeploy/network.go b/netdeploy/network.go
index b26c8ef5bc..6f31673a54 100644
--- a/netdeploy/network.go
+++ b/netdeploy/network.go
@@ -415,13 +415,14 @@ func (n Network) StartNode(binDir, nodeDir string, redirectOutput bool) (err err
 
 // Stop the network, ensuring primary relay stops first
 // No return code - we try to kill them if we can (if we read valid PID file)
-func (n Network) Stop(binDir string) {
-	c := make(chan struct{}, len(n.cfg.RelayDirs)+len(n.nodeDirs))
+func (n Network) Stop(binDir string) (err error) {
+	c := make(chan error, len(n.cfg.RelayDirs)+len(n.nodeDirs))
 	stopNodeContoller := func(nc *nodecontrol.NodeController) {
+		var stopErr error
 		defer func() {
-			c <- struct{}{}
+			c <- stopErr
 		}()
-		nc.FullStop()
+		stopErr = nc.FullStop()
 	}
 	for _, relayDir := range n.cfg.RelayDirs {
 		relayDataDir := n.getNodeFullPath(relayDir)
@@ -439,9 +440,13 @@ func (n Network) Stop(binDir string) {
 	}
 	// wait until we finish stopping all the node controllers.
 	for i := cap(c); i > 0; i-- {
-		<-c
+		stopErr := <-c
+		if stopErr != nil {
+			err = stopErr
+		}
 	}
 	close(c)
+	return err
 }
 
 // NetworkNodeStatus represents the result from checking the status of a particular node instance
diff --git a/network/wsNetwork.go b/network/wsNetwork.go
index a35375f517..6fc97def5e 100644
--- a/network/wsNetwork.go
+++ b/network/wsNetwork.go
@@ -793,6 +793,9 @@ func (wn *WebsocketNetwork) innerStop() {
 // Stop closes network connections and stops threads.
 // Stop blocks until all activity on this node is done.
 func (wn *WebsocketNetwork) Stop() {
+	wn.log.Debug("network is stopping")
+	defer wn.log.Debug("network has stopped")
+
 	wn.handler.ClearHandlers([]Tag{})
 
 	// if we have a working ticker, just stop it and clear it out. The access to this variable is safe since the Start()/Stop() are synced by the
diff --git a/node/impls.go b/node/impls.go
index 826f0399c4..3063e09659 100644
--- a/node/impls.go
+++ b/node/impls.go
@@ -44,6 +44,8 @@ func (i blockAuthenticatorImpl) Authenticate(block *bookkeeping.Block, cert *agr
 }
 
 func (i blockAuthenticatorImpl) Quit() {
+	logging.Base().Debug("block authenticator is stopping")
+	defer logging.Base().Debug("block authenticator has stopped")
 	i.AsyncVoteVerifier.Quit()
 }
 
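Network.Stop now returns an error: the per-controller stop closure delivers exactly one result (possibly nil) into a channel whose capacity equals the controller count, and the wait loop drains cap(c) results, keeping the last non-nil error as the return value. A standalone sketch of that collection shape follows; stopAll and the goroutine dispatch are assumptions made for the sketch (the dispatch sites in the full function are not shown in this excerpt).

package main

import (
	"errors"
	"fmt"
)

// stopAll mirrors the error-collection shape of Network.Stop above: one
// buffered slot per worker, one send per worker (done in a defer, so a
// result is delivered even on an early return), and a drain loop that
// remembers the last non-nil error.
func stopAll(stoppers []func() error) (err error) {
	c := make(chan error, len(stoppers))
	for _, stop := range stoppers {
		go func(stop func() error) {
			var stopErr error
			defer func() { c <- stopErr }()
			stopErr = stop()
		}(stop)
	}
	for i := cap(c); i > 0; i-- {
		if stopErr := <-c; stopErr != nil {
			err = stopErr
		}
	}
	close(c)
	return err
}

func main() {
	err := stopAll([]func() error{
		func() error { return nil },
		func() error { return errors.New("node was killed") },
	})
	fmt.Println(err) // prints the surviving error, if any
}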
diff --git a/node/node.go b/node/node.go
index acf204facf..d1c6cc4b82 100644
--- a/node/node.go
+++ b/node/node.go
@@ -396,6 +396,8 @@ func (node *AlgorandFullNode) startMonitoringRoutines() {
 // waitMonitoringRoutines waits for all the monitoring routines to exit. Note that
 // the node.mu must not be taken, and that the node's context should have been canceled.
 func (node *AlgorandFullNode) waitMonitoringRoutines() {
+	node.log.Debug("waiting on node monitoring routines to exit")
+	defer node.log.Debug("done waiting on node monitoring routines to exit")
 	node.monitoringRoutinesWaitGroup.Wait()
 }
 
@@ -409,6 +411,9 @@ func (node *AlgorandFullNode) ListeningAddress() (string, bool) {
 
 // Stop stops running the node. Once a node is closed, it can never start again.
 func (node *AlgorandFullNode) Stop() {
+	node.log.Debug("algorand node is stopping")
+	defer node.log.Debug("algorand node has stopped")
+
 	node.mu.Lock()
 	defer func() {
 		node.mu.Unlock()
@@ -431,9 +436,11 @@ func (node *AlgorandFullNode) Stop() {
 		node.ledgerService.Stop()
 	}
 	node.catchupBlockAuth.Quit()
+	node.log.Debug("crypto worker pools are stopping")
 	node.highPriorityCryptoVerificationPool.Shutdown()
 	node.lowPriorityCryptoVerificationPool.Shutdown()
 	node.cryptoPool.Shutdown()
+	node.log.Debug("crypto worker pools have stopped")
 	node.cancelCtx()
 }
 
diff --git a/nodecontrol/algodControl.go b/nodecontrol/algodControl.go
index 74137a72eb..a6bcb5fb82 100644
--- a/nodecontrol/algodControl.go
+++ b/nodecontrol/algodControl.go
@@ -49,6 +49,14 @@ func (e *NodeNotRunningError) Error() string {
 	return fmt.Sprintf("no running node in directory '%s'", e.algodDataDir)
 }
 
+// NodeKilledError thrown when StopAlgod is called but the node was killed by SIGKILL instead of a clean shutdown with SIGTERM
+type NodeKilledError struct {
+}
+
+func (e *NodeKilledError) Error() string {
+	return "node was killed"
+}
+
 // MissingDataDirError thrown when StopAlgod is called but requested directory does not exist
 type MissingDataDirError struct {
 	algodDataDir string
@@ -176,6 +184,7 @@ func (nc *NodeController) StopAlgod() (err error) {
 		if killed {
 			// delete the pid file.
 			os.Remove(nc.algodPidFile)
+			return &NodeKilledError{}
 		}
 	} else {
 		return &NodeNotRunningError{algodDataDir: nc.algodDataDir}
diff --git a/rpcs/blockService.go b/rpcs/blockService.go
index 8231b5a98b..d1ef82dfd4 100644
--- a/rpcs/blockService.go
+++ b/rpcs/blockService.go
@@ -169,6 +169,9 @@ func (bs *BlockService) Start() {
 
 // Stop servicing catchup requests over ws
 func (bs *BlockService) Stop() {
+	bs.log.Debug("block service is stopping")
+	defer bs.log.Debug("block service has stopped")
+
 	bs.mu.Lock()
 	close(bs.stop)
 	bs.mu.Unlock()
diff --git a/rpcs/ledgerService.go b/rpcs/ledgerService.go
index b3742bb985..5f75a36b0c 100644
--- a/rpcs/ledgerService.go
+++ b/rpcs/ledgerService.go
@@ -96,6 +96,9 @@ func (ls *LedgerService) Start() {
 // Stop servicing catchup requests
 func (ls *LedgerService) Stop() {
 	if ls.enableService {
+		logging.Base().Debug("ledger service is stopping")
+		defer logging.Base().Debug("ledger service has stopped")
+
 		ls.running.Store(0)
 		ls.stopping.Wait()
 	}
diff --git a/rpcs/txSyncer.go b/rpcs/txSyncer.go
index 1d7fc2c027..2d2a993d9a 100644
--- a/rpcs/txSyncer.go
+++ b/rpcs/txSyncer.go
@@ -97,6 +97,9 @@ func (syncer *TxSyncer) Start(canStart chan struct{}) {
 
 // Stop stops periodic syncing
 func (syncer *TxSyncer) Stop() {
+	syncer.log.Debug("transaction syncer is stopping")
+	defer syncer.log.Debug("transaction syncer has stopped")
+
 	syncer.cancel()
 	syncer.wg.Wait()
 }
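NodeKilledError lets callers of StopAlgod tell "the node had to be SIGKILLed" apart from an ordinary stop failure. One way a caller could test for it is a standard errors.As check; the stopAlgod stub below is purely illustrative and only stands in for the real NodeController.StopAlgod.

package main

import (
	"errors"
	"fmt"
)

// NodeKilledError mirrors the error type added to nodecontrol above.
type NodeKilledError struct{}

func (e *NodeKilledError) Error() string { return "node was killed" }

// stopAlgod is an illustrative stub: pretend the graceful SIGTERM path
// timed out and the process had to be killed.
func stopAlgod() error {
	return &NodeKilledError{}
}

func main() {
	err := stopAlgod()
	var killed *NodeKilledError
	if errors.As(err, &killed) {
		fmt.Println("node did not shut down cleanly:", err)
	}
}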
diff --git a/stateproof/worker.go b/stateproof/worker.go
index e73a06d137..f74e118f58 100644
--- a/stateproof/worker.go
+++ b/stateproof/worker.go
@@ -142,6 +142,9 @@ func (spw *Worker) initDb(inMemory bool) error {
 // Stop stops any goroutines associated with this worker. It is the caller responsibility to remove the register
 // network handlers
 func (spw *Worker) Stop() {
+	spw.log.Debug("stateproof worker is stopping")
+	defer spw.log.Debug("stateproof worker has stopped")
+
 	spw.shutdown()
 	spw.wg.Wait()
 
diff --git a/test/e2e-go/features/catchup/basicCatchup_test.go b/test/e2e-go/features/catchup/basicCatchup_test.go
index 2e3ac87943..938313206d 100644
--- a/test/e2e-go/features/catchup/basicCatchup_test.go
+++ b/test/e2e-go/features/catchup/basicCatchup_test.go
@@ -128,6 +128,15 @@ func runCatchupOverGossip(t fixtures.TestingTB,
 		a.NoError(err)
 		a.Empty(cfg.NetworkProtocolVersion)
 		cfg.NetworkProtocolVersion = ledgerNodeDowngradeTo
+		cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
+		cfg.SaveToDisk(dir)
+	} else {
+		// TODO: remove when TestCatchupOverGossip is fixed
+		dir, err := fixture.GetNodeDir("Node")
+		a.NoError(err)
+		cfg, err := config.LoadConfigFromDisk(dir)
+		a.NoError(err)
+		cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
 		cfg.SaveToDisk(dir)
 	}
 
@@ -138,6 +147,14 @@ func runCatchupOverGossip(t fixtures.TestingTB,
 		a.NoError(err)
 		a.Empty(cfg.NetworkProtocolVersion)
 		cfg.NetworkProtocolVersion = fetcherNodeDowngradeTo
+		cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
+		cfg.SaveToDisk(dir)
+	} else {
+		// TODO: remove when TestCatchupOverGossip is fixed
+		dir := fixture.PrimaryDataDir()
+		cfg, err := config.LoadConfigFromDisk(dir)
+		a.NoError(err)
+		cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
 		cfg.SaveToDisk(dir)
 	}
 
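Both new else branches in the test do the same thing: load the node's config from its data directory, raise BaseLoggerDebugLevel, and write the config back before the node starts. A sketch of that sequence factored into a helper; raiseDebugLevel is a hypothetical name (the test inlines the calls instead), and the sketch assumes the go-algorand config package used above.

package fixtures

import "github.com/algorand/go-algorand/config"

// raiseDebugLevel loads the node config from dataDir, bumps the logging
// verbosity, and saves it back, mirroring the load/set/save sequence the
// test performs in both branches.
func raiseDebugLevel(dataDir string) error {
	cfg, err := config.LoadConfigFromDisk(dataDir)
	if err != nil {
		return err
	}
	cfg.BaseLoggerDebugLevel = 5 // debug logging, matching the test change
	return cfg.SaveToDisk(dataDir)
}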
diff --git a/test/framework/fixtures/libgoalFixture.go b/test/framework/fixtures/libgoalFixture.go
index 40269aa111..de1a06623d 100644
--- a/test/framework/fixtures/libgoalFixture.go
+++ b/test/framework/fixtures/libgoalFixture.go
@@ -145,7 +145,7 @@ func (f *LibGoalFixture) nodeExitWithError(nc *nodecontrol.NodeController, err e
 	}
 
 	debugLog := func() {
-		fmt.Fprintf(os.Stderr, "Node at %s has terminated with an error: %v. Dumping logs...", nc.GetDataDir(), err)
+		fmt.Fprintf(os.Stderr, "Node at %s has terminated with an error: %v. Dumping logs...\n", nc.GetDataDir(), err)
 		f.dumpLogs(filepath.Join(nc.GetDataDir(), "node.log"))
 	}
 
@@ -158,7 +158,7 @@ func (f *LibGoalFixture) nodeExitWithError(nc *nodecontrol.NodeController, err e
 
 	ws := exitError.Sys().(syscall.WaitStatus)
 	exitCode := ws.ExitStatus()
-	fmt.Fprintf(os.Stderr, "Node at %s has terminated with error code %d (%v)", nc.GetDataDir(), exitCode, *exitError)
+	fmt.Fprintf(os.Stderr, "Node at %s has terminated with error code %d (%v)\n", nc.GetDataDir(), exitCode, *exitError)
 	debugLog()
 	require.NoError(f.t, err)
 }
@@ -345,7 +345,10 @@ func (f *LibGoalFixture) Shutdown() {
 func (f *LibGoalFixture) ShutdownImpl(preserveData bool) {
 	f.NC.StopKMD()
 	if preserveData {
-		f.network.Stop(f.binDir)
+		err := f.network.Stop(f.binDir)
+		if err != nil {
+			f.t.Logf("Fixture %s shutdown caught a network stop error: %v", f.Name, err)
+		}
 		for _, relayDir := range f.RelayDataDirs() {
 			f.dumpLogs(filepath.Join(relayDir, "node.log"))
 		}
@@ -353,14 +356,22 @@ func (f *LibGoalFixture) ShutdownImpl(preserveData bool) {
 			f.dumpLogs(filepath.Join(nodeDir, "node.log"))
 		}
 	} else {
-		f.network.Delete(f.binDir)
-
-		// Remove the test dir, if it was created by us as a temporary
-		// directory and it is empty. If there's anything still in the
-		// test dir, os.Remove()'s rmdir will fail and have no effect;
-		// we ignore this error.
-		if f.testDirTmp {
-			os.Remove(f.testDir)
+		err := f.network.Stop(f.binDir)
+		if err == nil {
+			// no error, proceed with cleanup
+			delErr := f.network.Delete(f.binDir)
+			if delErr != nil {
+				f.t.Logf("Fixture %s shutdown caught a network delete error: %v", f.Name, delErr)
+			}
+			// Remove the test dir, if it was created by us as a temporary
+			// directory and it is empty. If there's anything still in the
+			// test dir, os.Remove()'s rmdir will fail and have no effect;
+			// we ignore this error.
+			if f.testDirTmp {
+				os.Remove(f.testDir)
+			}
+		} else {
+			f.t.Logf("Fixture %s shutdown caught a network stop error: %v", f.Name, err)
 		}
 	}
 }
@@ -369,14 +380,14 @@ func (f *LibGoalFixture) ShutdownImpl(preserveData bool) {
 func (f *LibGoalFixture) dumpLogs(filePath string) {
 	file, err := os.Open(filePath)
 	if err != nil {
-		fmt.Fprintf(os.Stderr, "could not open %s", filePath)
+		fmt.Fprintf(os.Stderr, "could not open %s\n", filePath)
 		return
 	}
 	defer file.Close()
 
 	fmt.Fprintf(os.Stderr, "=================================\n")
 	parts := strings.Split(filePath, "/")
-	fmt.Fprintf(os.Stderr, "%s/%s:", parts[len(parts)-2], parts[len(parts)-1]) // Primary/node.log
+	fmt.Fprintf(os.Stderr, "%s/%s:\n", parts[len(parts)-2], parts[len(parts)-1]) // Primary/node.log
 	scanner := bufio.NewScanner(file)
 	for scanner.Scan() {
 		fmt.Fprint(os.Stderr, scanner.Text())
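Log preservation ultimately rests on dumpLogs: open node.log under each data directory and echo it line by line to stderr, so the output of a failed test run contains the node logs. A standalone approximation of that helper follows; dumpFile and its exact output format are illustrative, not the fixture's code.

package main

import (
	"bufio"
	"fmt"
	"os"
)

// dumpFile follows the same shape as the fixture's dumpLogs: open the file,
// announce which log is being echoed, then stream it line by line to stderr.
func dumpFile(path string) {
	file, err := os.Open(path)
	if err != nil {
		fmt.Fprintf(os.Stderr, "could not open %s\n", path)
		return
	}
	defer file.Close()

	fmt.Fprintf(os.Stderr, "=================================\n%s:\n", path)
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		fmt.Fprintln(os.Stderr, scanner.Text())
	}
}

func main() {
	dumpFile("node.log")
}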