Skip to content

Commit

Permalink
fix: allow stopping services in "starting" state (#503)
Browse files Browse the repository at this point in the history
Pebble has a 1-second okayWait time period for starting services, after
which, if the service is still not exited, it's considered as "running".
Previously, when services were still in the starting state within this
1-second okayWait period, they couldn't be stopped. This causes some
issues in tests. And, just because services have just started doesn't
mean they shouldn't be allowed to stop. This PR allows services in the
starting state (within the 1s okayWait period) to be stopped, no matter
if it's the daemon exits (Ctrl+C in the console) or the `pebble stop`
command is issued.
  • Loading branch information
IronCore864 authored Oct 10, 2024
1 parent f4e64ec commit acaacb2
Show file tree
Hide file tree
Showing 7 changed files with 250 additions and 129 deletions.
64 changes: 62 additions & 2 deletions internals/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/canonical/pebble/internals/overlord"
"github.com/canonical/pebble/internals/overlord/patch"
"github.com/canonical/pebble/internals/overlord/restart"
"github.com/canonical/pebble/internals/overlord/servstate"
"github.com/canonical/pebble/internals/overlord/standby"
"github.com/canonical/pebble/internals/overlord/state"
"github.com/canonical/pebble/internals/reaper"
Expand Down Expand Up @@ -1148,7 +1149,7 @@ services:
// We have to wait for it be in running state.
for i := 0; ; i++ {
if i >= 25 {
c.Fatalf("timed out waiting or service to start")
c.Fatalf("timed out waiting for service to start")
}
d.state.Lock()
change := d.state.Change(rsp.Change)
Expand Down Expand Up @@ -1283,7 +1284,7 @@ services:
// We have to wait for it be in running state for StopRunning to stop it.
for i := 0; ; i++ {
if i >= 25 {
c.Fatalf("timed out waiting or service to start")
c.Fatalf("timed out waiting for service to start")
}
d.state.Lock()
change := d.state.Change(rsp.Change)
Expand Down Expand Up @@ -1317,6 +1318,65 @@ services:
c.Check(tasks[0].Kind(), Equals, "stop")
}

func (s *daemonSuite) TestStopWithinOkayDelay(c *C) {
// Start the daemon.
writeTestLayer(s.pebbleDir, `
services:
test1:
override: replace
command: sleep 10
`)
d := s.newDaemon(c)
err := d.Init()
c.Assert(err, IsNil)
c.Assert(d.Start(), IsNil)

// Start the test service.
payload := bytes.NewBufferString(`{"action": "start", "services": ["test1"]}`)
req, err := http.NewRequest("POST", "/v1/services", payload)
c.Assert(err, IsNil)
rsp := v1PostServices(apiCmd("/v1/services"), req, nil).(*resp)
rec := httptest.NewRecorder()
rsp.ServeHTTP(rec, req)
c.Check(rec.Result().StatusCode, Equals, 202)

// Waiting for the change to be in doing state cannot guarantee the service is
// in the starting state, so here we wait until the service is in the starting
// state. We wait up to 25*20=500ms to make sure there is still half a second
// left to stop the service before okayDelay.
for i := 0; i < 25; i++ {
svcInfo, err := d.overlord.ServiceManager().Services([]string{"test1"})
c.Assert(err, IsNil)
if len(svcInfo) > 0 && svcInfo[0].Current == servstate.StatusActive {
break
}
time.Sleep(20 * time.Millisecond)
}

// Stop the daemon, which should stop services in starting state. At this point,
// it should still be within the okayDelay.
err = d.Stop(nil)
c.Assert(err, IsNil)

// Ensure a "stop" change is created, along with its "stop" tasks.
d.state.Lock()
defer d.state.Unlock()
changes := d.state.Changes()
var change *state.Change
for _, ch := range changes {
if ch.Kind() == "stop" {
change = ch
}
}
if change == nil {
c.Fatalf("stop change not found")
}
c.Check(change.Status(), Equals, state.DoneStatus)
tasks := change.Tasks()
c.Assert(tasks, HasLen, 1)
c.Check(tasks[0].Kind(), Equals, "stop")
}

func (s *daemonSuite) TestWritesRequireAdminAccess(c *C) {
for _, cmd := range API {
if cmd.Path == "/v1/notices" {
Expand Down
4 changes: 4 additions & 0 deletions internals/overlord/servstate/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,10 @@ func (s *serviceData) stop() error {
defer s.manager.servicesLock.Unlock()

switch s.state {
case stateStarting:
s.started <- fmt.Errorf("stopped before the %s okay delay", okayDelay)
fallthrough

case stateRunning:
logger.Debugf("Attempting to stop service %q by sending SIGTERM", s.config.Name)
// First send SIGTERM to try to terminate it gracefully.
Expand Down
4 changes: 2 additions & 2 deletions internals/overlord/servstate/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,15 +367,15 @@ func servicesToStop(m *ServiceManager) ([][]string, error) {
return nil, err
}

// Filter down to only those that are running or in backoff
// Filter down to only those that are starting, running or in backoff
m.servicesLock.Lock()
defer m.servicesLock.Unlock()
var result [][]string
for _, services := range stop {
var notStopped []string
for _, name := range services {
s := m.services[name]
if s != nil && (s.state == stateRunning || s.state == stateBackoff) {
if s != nil && (s.state == stateStarting || s.state == stateRunning || s.state == stateBackoff) {
notStopped = append(notStopped, name)
}
}
Expand Down
52 changes: 52 additions & 0 deletions internals/overlord/servstate/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,58 @@ services:
c.Assert(s.manager.StopTimeout(), Equals, time.Minute*60+time.Millisecond*200)
}

func (s *S) TestStopServiceWithinOkayDelay(c *C) {
// A longer okayDelay is used so that the change for starting the service won't
// quickly transition into the running state.
fakeOkayDelay := 20 * shortOkayDelay
servstate.FakeOkayWait(fakeOkayDelay)

s.newServiceManager(c)
serviceName := "test-stop-within-okaywait"
// The service sleeps for fakeOkayDelay second then creates a side effect (a file at donePath).
layer := `
services:
%s:
override: replace
command: /bin/sh -c "sleep %g; {{.NotifyDoneCheck}}"
`
s.planAddLayer(c, fmt.Sprintf(layer, serviceName, fakeOkayDelay.Seconds()))
s.planChanged(c)

// Start the service without waiting for change ready.
s.st.Lock()
ts, err := servstate.Start(s.st, [][]string{{serviceName}})
c.Check(err, IsNil)
chgStart := s.st.NewChange("test", "Start test")
chgStart.AddAll(ts)
s.st.Unlock()
s.runner.Ensure()

// Wait until the service is in the starting state.
s.waitUntilService(c, serviceName, func(service *servstate.ServiceInfo) bool {
return service.Current == servstate.StatusActive
})

// Stop the service within okayDelay.
chg := s.stopServices(c, [][]string{{serviceName}})
s.st.Lock()
c.Assert(chg.Err(), IsNil)
s.st.Unlock()

waitChangeReady(c, s.runner, chgStart, "Start test")

s.st.Lock()
c.Check(chgStart.Status(), Equals, state.ErrorStatus)
c.Check(chgStart.Err(), ErrorMatches, fmt.Sprintf(`(?s).*stopped before the %s okay delay.*`, fakeOkayDelay))
s.st.Unlock()

donePath := filepath.Join(s.dir, serviceName)
// If the service is stopped within okayDelay and is indeed terminated, donePath should not exist.
if _, err := os.Stat(donePath); err == nil {
c.Fatalf("service %s waiting for service output", serviceName)
}
}

func (s *S) TestKillDelayIsUsed(c *C) {
s.newServiceManager(c)
s.planAddLayer(c, testPlanLayer)
Expand Down
1 change: 1 addition & 0 deletions internals/overlord/servstate/state-diagram.dot
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ digraph service_state_machine {
node [penwidth=1]
initial -> starting [label="start"]
starting -> running [label="okay wait\nelapsed"]
starting -> terminating [label="stop (before\nokay wait elapses)"]
running -> terminating [label="stop"]
running -> terminating [label="check failed\n(action \"restart\")"]
terminating -> killing [label="terminate time\nelapsed"]
Expand Down
Loading

0 comments on commit acaacb2

Please sign in to comment.