From c3cb594050df6822c4a71da50848bb63854ee845 Mon Sep 17 00:00:00 2001 From: Tiexin Guo Date: Mon, 14 Oct 2024 11:16:15 +0800 Subject: [PATCH] chore: revert changes introduced by allowing stopping services in starting state (#510) In [this PR](https://github.com/canonical/pebble/pull/503), we introduced a feature to allow stopping services that are in the starting state / within the okayDelay. However, this makes [a deadlock issue in ringbuffer a real issue](https://github.com/canonical/pebble/issues/508). So we are reverting this change now and will redo it after the deadlock is resolved. --- internals/daemon/daemon_test.go | 64 +---- internals/overlord/servstate/handlers.go | 4 - internals/overlord/servstate/manager.go | 4 +- internals/overlord/servstate/manager_test.go | 52 ---- .../overlord/servstate/state-diagram.dot | 1 - .../overlord/servstate/state-diagram.svg | 248 +++++++++--------- tests/run_test.go | 6 +- 7 files changed, 129 insertions(+), 250 deletions(-) diff --git a/internals/daemon/daemon_test.go b/internals/daemon/daemon_test.go index 58976c09c..6751ead9f 100644 --- a/internals/daemon/daemon_test.go +++ b/internals/daemon/daemon_test.go @@ -40,7 +40,6 @@ import ( "github.com/canonical/pebble/internals/overlord" "github.com/canonical/pebble/internals/overlord/patch" "github.com/canonical/pebble/internals/overlord/restart" - "github.com/canonical/pebble/internals/overlord/servstate" "github.com/canonical/pebble/internals/overlord/standby" "github.com/canonical/pebble/internals/overlord/state" "github.com/canonical/pebble/internals/reaper" @@ -1149,7 +1148,7 @@ services: // We have to wait for it be in running state. for i := 0; ; i++ { if i >= 25 { - c.Fatalf("timed out waiting for service to start") + c.Fatalf("timed out waiting or service to start") } d.state.Lock() change := d.state.Change(rsp.Change) @@ -1284,7 +1283,7 @@ services: // We have to wait for it be in running state for StopRunning to stop it. for i := 0; ; i++ { if i >= 25 { - c.Fatalf("timed out waiting for service to start") + c.Fatalf("timed out waiting or service to start") } d.state.Lock() change := d.state.Change(rsp.Change) @@ -1318,65 +1317,6 @@ services: c.Check(tasks[0].Kind(), Equals, "stop") } -func (s *daemonSuite) TestStopWithinOkayDelay(c *C) { - // Start the daemon. - writeTestLayer(s.pebbleDir, ` -services: - test1: - override: replace - command: sleep 10 -`) - d := s.newDaemon(c) - err := d.Init() - c.Assert(err, IsNil) - c.Assert(d.Start(), IsNil) - - // Start the test service. - payload := bytes.NewBufferString(`{"action": "start", "services": ["test1"]}`) - req, err := http.NewRequest("POST", "/v1/services", payload) - c.Assert(err, IsNil) - rsp := v1PostServices(apiCmd("/v1/services"), req, nil).(*resp) - rec := httptest.NewRecorder() - rsp.ServeHTTP(rec, req) - c.Check(rec.Result().StatusCode, Equals, 202) - - // Waiting for the change to be in doing state cannot guarantee the service is - // in the starting state, so here we wait until the service is in the starting - // state. We wait up to 25*20=500ms to make sure there is still half a second - // left to stop the service before okayDelay. - for i := 0; i < 25; i++ { - svcInfo, err := d.overlord.ServiceManager().Services([]string{"test1"}) - c.Assert(err, IsNil) - if len(svcInfo) > 0 && svcInfo[0].Current == servstate.StatusActive { - break - } - time.Sleep(20 * time.Millisecond) - } - - // Stop the daemon, which should stop services in starting state. At this point, - // it should still be within the okayDelay. - err = d.Stop(nil) - c.Assert(err, IsNil) - - // Ensure a "stop" change is created, along with its "stop" tasks. - d.state.Lock() - defer d.state.Unlock() - changes := d.state.Changes() - var change *state.Change - for _, ch := range changes { - if ch.Kind() == "stop" { - change = ch - } - } - if change == nil { - c.Fatalf("stop change not found") - } - c.Check(change.Status(), Equals, state.DoneStatus) - tasks := change.Tasks() - c.Assert(tasks, HasLen, 1) - c.Check(tasks[0].Kind(), Equals, "stop") -} - func (s *daemonSuite) TestWritesRequireAdminAccess(c *C) { for _, cmd := range API { if cmd.Path == "/v1/notices" { diff --git a/internals/overlord/servstate/handlers.go b/internals/overlord/servstate/handlers.go index 24172e0c7..2bcdd6945 100644 --- a/internals/overlord/servstate/handlers.go +++ b/internals/overlord/servstate/handlers.go @@ -682,10 +682,6 @@ func (s *serviceData) stop() error { defer s.manager.servicesLock.Unlock() switch s.state { - case stateStarting: - s.started <- fmt.Errorf("stopped before the %s okay delay", okayDelay) - fallthrough - case stateRunning: logger.Debugf("Attempting to stop service %q by sending SIGTERM", s.config.Name) // First send SIGTERM to try to terminate it gracefully. diff --git a/internals/overlord/servstate/manager.go b/internals/overlord/servstate/manager.go index ac22d38f7..43a379d27 100644 --- a/internals/overlord/servstate/manager.go +++ b/internals/overlord/servstate/manager.go @@ -367,7 +367,7 @@ func servicesToStop(m *ServiceManager) ([][]string, error) { return nil, err } - // Filter down to only those that are starting, running or in backoff + // Filter down to only those that are running or in backoff m.servicesLock.Lock() defer m.servicesLock.Unlock() var result [][]string @@ -375,7 +375,7 @@ func servicesToStop(m *ServiceManager) ([][]string, error) { var notStopped []string for _, name := range services { s := m.services[name] - if s != nil && (s.state == stateStarting || s.state == stateRunning || s.state == stateBackoff) { + if s != nil && (s.state == stateRunning || s.state == stateBackoff) { notStopped = append(notStopped, name) } } diff --git a/internals/overlord/servstate/manager_test.go b/internals/overlord/servstate/manager_test.go index 3a8c9d30d..c097336ca 100644 --- a/internals/overlord/servstate/manager_test.go +++ b/internals/overlord/servstate/manager_test.go @@ -297,58 +297,6 @@ services: c.Assert(s.manager.StopTimeout(), Equals, time.Minute*60+time.Millisecond*200) } -func (s *S) TestStopServiceWithinOkayDelay(c *C) { - // A longer okayDelay is used so that the change for starting the service won't - // quickly transition into the running state. - fakeOkayDelay := 20 * shortOkayDelay - servstate.FakeOkayWait(fakeOkayDelay) - - s.newServiceManager(c) - serviceName := "test-stop-within-okaywait" - // The service sleeps for fakeOkayDelay second then creates a side effect (a file at donePath). - layer := ` -services: - %s: - override: replace - command: /bin/sh -c "sleep %g; {{.NotifyDoneCheck}}" -` - s.planAddLayer(c, fmt.Sprintf(layer, serviceName, fakeOkayDelay.Seconds())) - s.planChanged(c) - - // Start the service without waiting for change ready. - s.st.Lock() - ts, err := servstate.Start(s.st, [][]string{{serviceName}}) - c.Check(err, IsNil) - chgStart := s.st.NewChange("test", "Start test") - chgStart.AddAll(ts) - s.st.Unlock() - s.runner.Ensure() - - // Wait until the service is in the starting state. - s.waitUntilService(c, serviceName, func(service *servstate.ServiceInfo) bool { - return service.Current == servstate.StatusActive - }) - - // Stop the service within okayDelay. - chg := s.stopServices(c, [][]string{{serviceName}}) - s.st.Lock() - c.Assert(chg.Err(), IsNil) - s.st.Unlock() - - waitChangeReady(c, s.runner, chgStart, "Start test") - - s.st.Lock() - c.Check(chgStart.Status(), Equals, state.ErrorStatus) - c.Check(chgStart.Err(), ErrorMatches, fmt.Sprintf(`(?s).*stopped before the %s okay delay.*`, fakeOkayDelay)) - s.st.Unlock() - - donePath := filepath.Join(s.dir, serviceName) - // If the service is stopped within okayDelay and is indeed terminated, donePath should not exist. - if _, err := os.Stat(donePath); err == nil { - c.Fatalf("service %s waiting for service output", serviceName) - } -} - func (s *S) TestKillDelayIsUsed(c *C) { s.newServiceManager(c) s.planAddLayer(c, testPlanLayer) diff --git a/internals/overlord/servstate/state-diagram.dot b/internals/overlord/servstate/state-diagram.dot index 63f939142..7954864dc 100644 --- a/internals/overlord/servstate/state-diagram.dot +++ b/internals/overlord/servstate/state-diagram.dot @@ -7,7 +7,6 @@ digraph service_state_machine { node [penwidth=1] initial -> starting [label="start"] starting -> running [label="okay wait\nelapsed"] - starting -> terminating [label="stop (before\nokay wait elapses)"] running -> terminating [label="stop"] running -> terminating [label="check failed\n(action \"restart\")"] terminating -> killing [label="terminate time\nelapsed"] diff --git a/internals/overlord/servstate/state-diagram.svg b/internals/overlord/servstate/state-diagram.svg index 44795c88b..4dd2f2bd3 100644 --- a/internals/overlord/servstate/state-diagram.svg +++ b/internals/overlord/servstate/state-diagram.svg @@ -1,229 +1,221 @@ - - - + + service_state_machine - + initial - -initial + +initial starting - -starting + +starting initial->starting - - -start + + +start running - -running + +running starting->running - - -okay wait -elapsed - - - -terminating - -terminating - - - -starting->terminating - - -stop (before -okay wait elapses) + + +okay wait +elapsed exited - -exited + +exited - + starting->exited - - -exited + + +exited + + + +terminating + +terminating - + running->terminating - - -stop + + +stop - + running->terminating - - -check failed -(action "restart") + + +check failed +(action "restart") backoff - -backoff + +backoff - + running->backoff - - -exited -(action "restart") + + +exited +(action "restart") - + running->exited - - -exited -(action "ignore") + + +exited +(action "ignore") - + running->exited - - -exited -(action "shutdown") + + +exited +(action "shutdown") killing - -killing + +killing - + terminating->killing - - -terminate time -elapsed + + +terminate time +elapsed stopped - -stopped + +stopped - + terminating->stopped - - -exited -(not restarting) + + +exited +(not restarting) - + terminating->backoff - - -exited -(restarting) + + +exited +(restarting) - + killing->stopped - - -exited -(not restarting) + + +exited +(not restarting) - + killing->stopped - - -kill time -elapsed + + +kill time +elapsed - + killing->backoff - - -exited -(restarting) + + +exited +(restarting) - + stopped->starting - - -start + + +start - + backoff->starting - - -start + + +start - + backoff->running - - -backoff time -elapsed + + +backoff time +elapsed - + backoff->stopped - - -stop + + +stop - + exited->starting - - -start + + +start - + exited->stopped - - -stop + + +stop - + exited->backoff - - -check failed -(action "restart") + + +check failed +(action "restart") diff --git a/tests/run_test.go b/tests/run_test.go index d9b0073e4..c98c6937e 100644 --- a/tests/run_test.go +++ b/tests/run_test.go @@ -49,7 +49,9 @@ services: createLayer(t, pebbleDir, "001-simple-layer.yaml", layerYAML) - _, _ = pebbleRun(t, pebbleDir) + _, stderrCh := pebbleRun(t, pebbleDir) + waitForLog(t, stderrCh, "pebble", "Started default services", 3*time.Second) + waitForFile(t, filepath.Join(pebbleDir, "svc1"), 3*time.Second) waitForFile(t, filepath.Join(pebbleDir, "svc2"), 3*time.Second) } @@ -140,6 +142,7 @@ services: stdoutCh, stderrCh := pebbleRun(t, pebbleDir, "--verbose") waitForLog(t, stderrCh, "pebble", "Started daemon", 3*time.Second) waitForLog(t, stdoutCh, "svc1", "hello world", 3*time.Second) + waitForLog(t, stderrCh, "pebble", "Started default services", 3*time.Second) } // TestArgs tests that Pebble provides additional arguments to a service @@ -165,6 +168,7 @@ services: ) waitForLog(t, stderrCh, "pebble", "Started daemon", 3*time.Second) waitForLog(t, stdoutCh, "svc1", "hello world", 3*time.Second) + waitForLog(t, stderrCh, "pebble", "Started default services", 3*time.Second) } // TestIdentities tests that Pebble seeds identities from a file