Skip to content

Commit

Permalink
fix(daemon): avoid acquiring state lock in health check endpoint (#369)
Browse files Browse the repository at this point in the history
I don't love the addition of the (exported) State.LockCount method, but
I couldn't think of another way to ensure that State.Lock was not being
called, apart from significant surgery allowing us to swap out the
state lock with an interface.

In addition, I'm not sure we need the maintenance/warnings stuff at all
in Pebble, as they're not used, but leaving that for a separate
discussion and PR.

I tested by saving the script https://go.dev/play/p/LJDLEaXeBpd to
cmd/hithealth/main.go, running a Pebble server, and then running a
task to heavily load my CPU cores (though the latter doesn't seem to
make much of a difference!).

Without the fix, GET /v1/health response times are all over 20ms, and
commonly in the 100-300ms range. In other words, very long for a
do-nothing health check endpoint. This is because the other clients
(in the hithealth script) are starting services which modifies state
and holds the state lock for a relatively long time.

$ go run ./cmd/hithealth/
     0 > 20ms: 0.028897s
     1 > 20ms: 0.192796s
     2 > 20ms: 0.268904s
     3 > 20ms: 0.082985s
     4 > 20ms: 0.030554s
     5 > 20ms: 0.156912s
     6 > 20ms: 0.245212s
     7 > 20ms: 0.047436s
     8 > 20ms: 0.099474s
     9 > 20ms: 0.070440s
    10 > 20ms: 0.060641s
    11 > 20ms: 0.202535s
    12 > 20ms: 0.206226s
    ...

With the fix, GET /v1/health doesn't hold the state lock at all.
Response times are only over 20ms every 10,000 or so requests, and
never above 100ms even under heavy load:

$ go run ./cmd/hithealth/
 13891 > 20ms: 0.023703s
 15923 > 20ms: 0.024769s
 21915 > 20ms: 0.076423s
 ...
  • Loading branch information
benhoyt committed Feb 28, 2024
1 parent 143329a commit c44c1eb
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 4 deletions.
34 changes: 30 additions & 4 deletions internals/daemon/api_health.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package daemon

import (
"encoding/json"
"net/http"

"github.com/canonical/x-go/strutil"
Expand Down Expand Up @@ -57,9 +58,34 @@ func v1Health(c *Command, r *http.Request, _ *UserState) Response {
}
}

return SyncResponse(&resp{
Type: ResponseTypeSync,
Status: status,
Result: healthInfo{Healthy: healthy},
return SyncResponse(&healthResp{
Type: ResponseTypeSync,
Status: status,
StatusText: http.StatusText(status),
Result: healthInfo{Healthy: healthy},
})
}

// Like the resp struct, but without the warning/maintenance fields, so that
// the health endpoint doesn't have to acquire the state lock (resulting in a
// slow response on heavily-loaded systems).
type healthResp struct {
Type ResponseType `json:"type"`
Status int `json:"status-code"`
StatusText string `json:"status,omitempty"`
Result interface{} `json:"result,omitempty"`
}

func (r *healthResp) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
status := r.Status
bs, err := json.Marshal(r)
if err != nil {
logger.Noticef("Cannot marshal %#v to JSON: %v", *r, err)
bs = nil
status = http.StatusInternalServerError
}

w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)
w.Write(bs)
}
51 changes: 51 additions & 0 deletions internals/daemon/api_health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ import (
"io"
"net/http"
"net/http/httptest"
"time"

. "gopkg.in/check.v1"

"github.com/canonical/pebble/client"
"github.com/canonical/pebble/internals/overlord"
"github.com/canonical/pebble/internals/overlord/checkstate"
"github.com/canonical/pebble/internals/plan"
Expand Down Expand Up @@ -206,6 +208,55 @@ func (s *healthSuite) TestChecksError(c *C) {
})
}

// Ensure state lock is not held at all for GET /v1/health requests.
// Regression test for issue described at:
//
// - https://github.com/canonical/pebble/issues/366
// - https://bugs.launchpad.net/juju/+bug/2052517
func (s *apiSuite) TestStateLockNotHeld(c *C) {
daemonOpts := &Options{
Dir: s.pebbleDir,
SocketPath: s.pebbleDir + ".pebble.socket",
}
daemon, err := New(daemonOpts)
c.Assert(err, IsNil)
c.Assert(daemon.Init(), IsNil)
c.Assert(daemon.Start(), IsNil)
defer func() {
c.Assert(daemon.Stop(nil), IsNil)
}()

// Wait for lock count to stabilise (daemon starts goroutine(s) that use
// it on startup). Normally it will stabilise almost instantly.
prevCount := daemon.state.LockCount()
for i := 0; i < 50; i++ {
if i == 49 {
c.Fatal("timed out waiting for lock count to stabilise")
}
time.Sleep(100 * time.Millisecond)
newCount := daemon.state.LockCount()
if newCount == prevCount {
break
}
prevCount = newCount
}

// Use real HTTP client so we're exercising the full ServeHTTP flow.
// Could use daemon.serve.Handler directly, but this seems even better.
pebble, err := client.New(&client.Config{
Socket: daemonOpts.SocketPath,
})
c.Assert(err, IsNil)

// Ensure lock count doesn't change during GET /v1/health request.
before := daemon.state.LockCount()
healthy, err := pebble.Health(&client.HealthOptions{})
c.Assert(err, IsNil)
c.Assert(healthy, Equals, true)
after := daemon.state.LockCount()
c.Assert(after, Equals, before)
}

func serveHealth(c *C, method, url string, body io.Reader) (int, map[string]interface{}) {
recorder := httptest.NewRecorder()
request, err := http.NewRequest(method, url, body)
Expand Down
10 changes: 10 additions & 0 deletions internals/overlord/state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ type State struct {
mu sync.Mutex
muC int32

lockCount int32 // used only for testing

lastTaskId int
lastChangeId int
lastLaneId int
Expand Down Expand Up @@ -128,6 +130,14 @@ func (s *State) Modified() bool {
func (s *State) Lock() {
s.mu.Lock()
atomic.AddInt32(&s.muC, 1)
atomic.AddInt32(&s.lockCount, 1)
}

// LockCount returns the number of times the state lock was held.
//
// NOTE: This needs to be exported, but should only be used in testing.
func (s *State) LockCount() int {
return int(atomic.LoadInt32(&s.lockCount))
}

func (s *State) reading() {
Expand Down

0 comments on commit c44c1eb

Please sign in to comment.