Skip to content

Commit

Permalink
Only calculate build info once (#4214)
Browse files Browse the repository at this point in the history
This commit fixes a bug where build information is calculated for each
keepalive generated. This generates some additional work for the agent
which should be marginal in all but the most extreme cases.

The bug manifests itself more clearly when using the loadit tool for
load testing, consuming a large fraction of the CPU when 10,000 agents
are loaded.

Signed-off-by: Eric Chlebek <[email protected]>
  • Loading branch information
echlebek authored Mar 1, 2021
1 parent 2a5d0bd commit 28493aa
Show file tree
Hide file tree
Showing 10 changed files with 101 additions and 162 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/golangci-lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
- name: golangci-lint
uses: golangci/golangci-lint-action@v2
with:
version: v1.29
version: v1.37.1
args: --timeout=5m
lint-api-core-v2-mod:
name: lint-api-core-v2-mod
Expand All @@ -31,7 +31,7 @@ jobs:
- name: golangci-lint
uses: golangci/golangci-lint-action@v2
with:
version: v1.29
version: v1.37.1
working-directory: api/core/v2
args: --timeout=5m
lint-api-core-v3-mod:
Expand All @@ -48,7 +48,7 @@ jobs:
- name: golangci-lint
uses: golangci/golangci-lint-action@v2
with:
version: v1.29
version: v1.37.1
working-directory: api/core/v3
args: --timeout=5m
lint-backend-store-v2-mod:
Expand All @@ -62,7 +62,7 @@ jobs:
- name: golangci-lint
uses: golangci/golangci-lint-action@v2
with:
version: v1.29
version: v1.37.1
working-directory: backend/store/v2
args: --timeout=5m
lint-types-mod:
Expand All @@ -76,6 +76,6 @@ jobs:
- name: golangci-lint
uses: golangci/golangci-lint-action@v2
with:
version: v1.29
version: v1.37.1
working-directory: types
args: --timeout=5m
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ Versioning](http://semver.org/spec/v2.0.0.html).
defined.
- Fixed a bug where the scheduler could crash in rare circumstances, when using
round robin checks.
- Fixed a bug where build information would get calculated for every keepalive
in OSS builds.

## [6.2.2] - 2021-01-14

Expand Down
74 changes: 74 additions & 0 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
time "github.com/echlebek/timeproxy"
"github.com/gogo/protobuf/proto"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"golang.org/x/time/rate"

corev2 "github.com/sensu/sensu-go/api/core/v2"
Expand All @@ -43,6 +45,64 @@ const (
entityConfigGracePeriod = 10 * time.Second
)

// Metric names for the prometheus counters exported by the agent. Each
// constant is used as the Name of the matching counter declared in the var
// block below.
const (
// MessagesReceived names the counter of messages received from sensu-backend.
MessagesReceived = "sensu_go_agent_messages_received"
// MessagesSent names the counter of messages sent to sensu-backend.
MessagesSent = "sensu_go_agent_messages_sent"
// MessagesDropped names the counter of messages that failed to send to
// sensu-backend.
MessagesDropped = "sensu_go_agent_messages_dropped"
// NewConnections names the counter of new connections made to sensu-backend.
NewConnections = "sensu_go_agent_new_connections"
// WebsocketErrors names the counter of websocket errors encountered.
WebsocketErrors = "sensu_go_agent_websocket_errors"
)

// Package-level prometheus counters for the agent. Every counter is built as
// a CounterVec with an empty label set, so callers increment it with
// WithLabelValues().Inc() and no label arguments.
var (
// messagesReceived counts messages received from sensu-backend; it is
// incremented in receiveLoop after a message is received without error.
messagesReceived = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: MessagesReceived,
Help: "The total number of messages received from sensu-backend",
},
[]string{},
)

// messagesSent counts messages sent to sensu-backend; it is incremented in
// sendLoop after conn.Send returns without error.
messagesSent = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: MessagesSent,
Help: "The total number of messages sent to sensu-backend",
},
[]string{},
)

// messagesDropped counts messages that failed to send; it is incremented in
// sendLoop when conn.Send returns an error.
messagesDropped = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: MessagesDropped,
Help: "The total number of messages that failed to send to sensu-backend",
},
[]string{},
)

// newConnections counts new connections made to sensu-backend; it is
// incremented in connectionManager once the agent is marked connected.
newConnections = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: NewConnections,
Help: "The total number of new connections made to sensu-backend",
},
[]string{},
)

// websocketErrors counts websocket errors. NOTE(review): in the visible code
// it is only incremented when transport.Connect fails in connectWithBackoff;
// confirm whether other error paths should count as well.
websocketErrors = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: WebsocketErrors,
Help: "The total number of websocket errors encountered",
},
[]string{},
)
)

// init registers the agent's counters with prometheus. Registration errors
// are discarded rather than handled, so a failed registration does not
// prevent the package from initializing.
func init() {
	collectors := []prometheus.Collector{
		messagesReceived,
		messagesSent,
		messagesDropped,
		newConnections,
		websocketErrors,
	}
	for _, collector := range collectors {
		_ = prometheus.Register(collector)
	}
}

// GetDefaultAgentName returns the default agent name
func GetDefaultAgentName() string {
defaultAgentName, err := os.Hostname()
Expand Down Expand Up @@ -142,6 +202,12 @@ func NewAgentContext(ctx context.Context, config *Config) (*Agent, error) {
}
agent.allowList = allowList

if config.PrometheusBinding != "" {
go func() {
logger.WithError(http.ListenAndServe(config.PrometheusBinding, promhttp.Handler())).Error("couldn't serve prometheus metrics")
}()
}

return agent, nil
}

Expand Down Expand Up @@ -379,6 +445,8 @@ func (a *Agent) connectionManager(ctx context.Context, cancel context.CancelFunc
a.connected = true
a.connectedMu.Unlock()

newConnections.WithLabelValues().Inc()

go a.receiveLoop(ctx, cancel, conn)

// Block until we receive an entity config, or the grace period expires,
Expand Down Expand Up @@ -413,6 +481,7 @@ func (a *Agent) receiveLoop(ctx context.Context, cancel context.CancelFunc, conn
logger.WithError(err).Error("transport receive error")
return
}
messagesReceived.WithLabelValues().Inc()

go func(msg *transport.Message) {
logger.WithFields(logrus.Fields{
Expand Down Expand Up @@ -460,14 +529,18 @@ func (a *Agent) sendLoop(ctx context.Context, cancel context.CancelFunc, conn tr
return nil
case msg := <-a.sendq:
if err := conn.Send(msg); err != nil {
messagesDropped.WithLabelValues().Inc()
logger.WithError(err).Error("error sending message over websocket")
return err
}
messagesSent.WithLabelValues().Inc()
case <-keepalive.C:
if err := conn.Send(a.newKeepalive()); err != nil {
messagesDropped.WithLabelValues().Inc()
logger.WithError(err).Error("error sending message over websocket")
return err
}
messagesSent.WithLabelValues().Inc()
}
}
}
Expand Down Expand Up @@ -606,6 +679,7 @@ func (a *Agent) connectWithBackoff(ctx context.Context) (transport.Transport, er
logger.WithField("header", fmt.Sprintf("Accept: %s", agentd.ProtobufSerializationHeader)).Debug("setting header")
c, respHeader, err := transport.Connect(backendURL, a.config.TLS, a.header, a.config.BackendHandshakeTimeout)
if err != nil {
websocketErrors.WithLabelValues().Inc()
logger.WithError(err).Error("reconnection attempt failed")
return false, nil
}
Expand Down
5 changes: 4 additions & 1 deletion agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ const (

// TCPSocketReadDeadline specifies the maximum time the TCP socket will wait
// to receive data.
TCPSocketReadDeadline = 500 * time.Millisecond
TCPSocketReadDeadline = 5000 * time.Millisecond

// DefaultAPIHost specifies the default API Host
DefaultAPIHost = "127.0.0.1"
Expand Down Expand Up @@ -195,6 +195,9 @@ type Config struct {
// AgentManagedEntity indicates whether the agent's entity is solely managed
// by the agent, rather than the backend API
AgentManagedEntity bool

// PrometheusBinding, if set, is the address on which prometheus metrics are served (e.g. localhost:8888).
PrometheusBinding string
}

// StatsdServerConfig contains the statsd server configuration
Expand Down
Loading

0 comments on commit 28493aa

Please sign in to comment.