Skip to content

Commit

Permalink
test: run prometheus in tests and check metrics after talemu tests
Browse files Browse the repository at this point in the history
Make the tests fail if the metrics do not meet the expected thresholds.

Signed-off-by: Artem Chernyshev <[email protected]>
  • Loading branch information
Unix4ever committed Jul 29, 2024
1 parent 111796a commit 60355b6
Show file tree
Hide file tree
Showing 6 changed files with 180 additions and 2 deletions.
132 changes: 132 additions & 0 deletions cmd/integration-test/pkg/tests/stats.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// Copyright (c) 2024 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

package tests

import (
"context"
"fmt"
"sort"
"strings"
"testing"
"time"

"github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
"github.com/siderolabs/go-retry/retry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// AssertStatsLimits checks that metrics don't show any spikes of resource reads/writes, controller wakeups.
// This test should only be run after the integration tests set with Talemu enabled as the thresholds are adjusted for it.
// Should have Prometheus running on 9090.
//
// Every check is a PromQL query paired with an assertion on the resulting sample value.
// Checks run as parallel subtests; each query is issued against http://127.0.0.1:9090 and is
// retried (query errors, warnings, missing samples and failed assertions are all retryable)
// before the subtest is failed.
func AssertStatsLimits(testCtx context.Context) TestFunc {
	return func(t *testing.T) {
		for _, tt := range []struct {
			check func(a *assert.Assertions, value float64)
			name  string
			query string
		}{
			{
				name:  "resource CRUD",
				query: `sum(omni_resource_operations_total{operation=~"create|update", type!="MachineStatusLinks.omni.sidero.dev"})`,
				check: func(a *assert.Assertions, value float64) { a.Less(value, float64(10000)) },
			},
			{
				name:  "queue length",
				query: `sum(omni_runtime_qcontroller_queue_length)`,
				check: func(a *assert.Assertions, value float64) { a.Zero(value) },
			},
			{
				name:  "controller wakeups",
				query: `sum(omni_runtime_controller_wakeups{controller!="MachineStatusLinkController"})`,
				check: func(a *assert.Assertions, value float64) { a.Less(value, float64(10000)) },
			},
		} {
			t.Run(tt.name, func(t *testing.T) {
				t.Parallel()

				ctx, cancel := context.WithTimeout(testCtx, time.Second*16)
				defer cancel()

				err := retry.Constant(time.Second * 15).Retry(func() error {
					promClient, err := api.NewClient(api.Config{
						Address: "http://127.0.0.1:9090",
					})
					if err != nil {
						return retry.ExpectedError(err)
					}

					var (
						value    model.Value
						warnings v1.Warnings
					)

					// A fresh aggregator per attempt, so failures from earlier attempts don't leak into this one.
					agg := assertionAggregator{}

					v1api := v1.NewAPI(promClient)

					value, warnings, err = v1api.Query(ctx, tt.query, time.Now())
					if err != nil {
						return retry.ExpectedError(err)
					}

					if len(warnings) > 0 {
						return retry.ExpectedErrorf("prometheus query had warnings %#v", warnings)
					}

					// Assertions report into the aggregator instead of *testing.T, so that a failed
					// check can be retried rather than failing the subtest immediately.
					asserter := assert.New(&agg)

					switch val := value.(type) {
					case *model.Scalar:
						tt.check(asserter, float64(val.Value))
					case model.Vector:
						// An empty vector means the query matched no series (e.g. nothing scraped yet):
						// indexing it would panic, so treat it as a retryable condition instead.
						if val.Len() == 0 {
							return retry.ExpectedErrorf("prometheus query %q returned no samples", tt.query)
						}

						tt.check(asserter, float64(val[val.Len()-1].Value))
					default:
						return fmt.Errorf("unexpected value type %s", val.Type())
					}

					if agg.hadErrors {
						// Pass the aggregated text as an argument: it may contain '%' characters
						// and must not be interpreted as a format string.
						return retry.ExpectedErrorf("%s", agg.String())
					}

					return nil
				})

				require.NoError(t, err)
			})
		}
	}
}

// assertionAggregator collects assertion failure messages instead of failing a test directly.
// It exposes an Errorf method so it can be handed to assert.New (see AssertStatsLimits),
// and de-duplicates identical messages.
type assertionAggregator struct {
	// errors is the set of distinct formatted failure messages; allocated lazily on first Errorf.
	errors map[string]struct{}
	// hadErrors is set once Errorf has been called at least once.
	hadErrors bool
}

// Errorf formats the message and records it, marking the aggregator as failed.
// Recording the same formatted message twice keeps a single entry.
func (agg *assertionAggregator) Errorf(format string, args ...any) {
	if agg.errors == nil {
		agg.errors = make(map[string]struct{})
	}

	agg.hadErrors = true
	agg.errors[fmt.Sprintf(format, args...)] = struct{}{}
}

// String renders the recorded messages as a bullet list (" * " prefix), one per line,
// sorted lexicographically so the output is deterministic despite map iteration order.
// It returns an empty string when nothing was recorded.
func (agg *assertionAggregator) String() string {
	messages := make([]string, 0, len(agg.errors))

	for message := range agg.errors {
		messages = append(messages, message)
	}

	// All bullets share the same prefix, so sorting the raw messages yields the same order
	// as sorting the prefixed lines.
	sort.Strings(messages)

	var out strings.Builder

	for i, message := range messages {
		if i > 0 {
			out.WriteByte('\n')
		}

		out.WriteString(" * ")
		out.WriteString(message)
	}

	return out.String()
}
26 changes: 24 additions & 2 deletions cmd/integration-test/pkg/tests/tests.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ type Options struct {
RunTestPattern string

CleanupLinks bool
RunStatsCheck bool
ExpectedMachines int

RestartAMachineFunc RestartAMachineFunc
Expand Down Expand Up @@ -1147,6 +1148,25 @@ Test flow of cluster creation and scaling using cluster templates.`,
nil,
).Run()

extraTests := []testing.InternalTest{}

if options.RunStatsCheck {
extraTests = append(extraTests, testing.InternalTest{
Name: "AssertStatsLimits",
F: AssertStatsLimits(ctx),
})
}

if len(extraTests) > 0 && exitCode == 0 {
exitCode = testing.MainStart(
matchStringOnly(func(string, string) (bool, error) { return true, nil }),
extraTests,
nil,
nil,
nil,
).Run()
}

if options.CleanupLinks {
if err := cleanupLinks(ctx, rootClient.Omni().State()); err != nil {
return err
Expand Down Expand Up @@ -1191,8 +1211,8 @@ func cleanupLinks(ctx context.Context, st state.State) error {
})
}

func makeTests(ctx context.Context, testsToRun []testGroup, machineSemaphore *semaphore.Weighted) []testing.InternalTest {
return xslices.Map(testsToRun, func(group testGroup) testing.InternalTest {
func makeTests(ctx context.Context, testsToRun []testGroup, machineSemaphore *semaphore.Weighted, tests ...testing.InternalTest) []testing.InternalTest {
groups := xslices.Map(testsToRun, func(group testGroup) testing.InternalTest {
return testing.InternalTest{
Name: group.Name,
F: func(t *testing.T) {
Expand Down Expand Up @@ -1237,6 +1257,8 @@ func makeTests(ctx context.Context, testsToRun []testGroup, machineSemaphore *se
},
}
})

return append(groups, tests...)
}

//nolint:govet
Expand Down
3 changes: 3 additions & 0 deletions cmd/integration-test/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ var rootCmd = &cobra.Command{

ExpectedMachines: rootCmdFlags.expectedMachines,
CleanupLinks: rootCmdFlags.cleanupLinks,
RunStatsCheck: rootCmdFlags.runStatsCheck,

MachineOptions: rootCmdFlags.machineOptions,
AnotherTalosVersion: rootCmdFlags.anotherTalosVersion,
Expand Down Expand Up @@ -115,6 +116,7 @@ var rootCmdFlags struct {
expectedMachines int
parallel int64
cleanupLinks bool
runStatsCheck bool

testsTimeout time.Duration

Expand Down Expand Up @@ -151,6 +153,7 @@ func init() {
rootCmd.Flags().Int64VarP(&rootCmdFlags.parallel, "parallel", "p", 4, "tests parallelism")
rootCmd.Flags().DurationVarP(&rootCmdFlags.testsTimeout, "timeout", "t", time.Hour, "tests global timeout")
rootCmd.Flags().BoolVar(&rootCmdFlags.cleanupLinks, "cleanup-links", false, "remove all links after the tests are complete")
rootCmd.Flags().BoolVar(&rootCmdFlags.runStatsCheck, "run-stats-check", false, "runs stats check after the test is complete")
}

// withContext wraps with CLI context.
Expand Down
10 changes: 10 additions & 0 deletions hack/compose/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,16 @@ services:
MINIO_SECRET_KEY: secret123
command: server /minio-server/export

prometheus:
  # Host networking: Prometheus listens directly on the host's port 9090 and can reach
  # the scrape target on 127.0.0.1. Port publishing does not apply in host network mode,
  # so no `ports` mapping is declared here.
  network_mode: host
  image: prom/prometheus
  depends_on:
    - omni
  volumes:
    - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml

minio-mc:
image: minio/mc:latest
container_name: minio-mc
Expand Down
7 changes: 7 additions & 0 deletions hack/compose/prometheus/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
global:
scrape_interval: 5s
evaluation_interval: 5s
scrape_configs:
- job_name: "omni"
static_configs:
- targets: ["127.0.0.1:2122"]
4 changes: 4 additions & 0 deletions hack/test/integration.sh
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ nice -n 10 ${ARTIFACTS}/omni-linux-amd64 \
KERNEL_ARGS="siderolink.api=grpc://$LOCAL_IP:8090?jointoken=${JOIN_TOKEN} talos.events.sink=[fdae:41e4:649b:9303::1]:8090 talos.logging.kernel=tcp://[fdae:41e4:649b:9303::1]:8092"

if [[ "${RUN_TALEMU_TESTS:-false}" == "true" ]]; then
PROMETHEUS_CONTAINER=$(docker run --network host -p "9090:9090" -v "$(pwd)/hack/compose/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml" -it --rm -d prom/prometheus)

TALEMU_CONTAINER=$(docker run --network host --cap-add=NET_ADMIN -it --rm -d ghcr.io/siderolabs/talemu:latest --kernel-args="${KERNEL_ARGS}" --machines=30)

sleep 10
Expand All @@ -126,12 +128,14 @@ if [[ "${RUN_TALEMU_TESTS:-false}" == "true" ]]; then
--omnictl-path=${ARTIFACTS}/omnictl-linux-amd64 \
--expected-machines=30 \
--cleanup-links \
--run-stats-check \
-t 4m \
-p 10 \
${TALEMU_TEST_ARGS:-}

docker stop $TALEMU_CONTAINER
docker rm -f $TALEMU_CONTAINER
docker rm -f $PROMETHEUS_CONTAINER
fi

# Prepare partial machine config
Expand Down

0 comments on commit 60355b6

Please sign in to comment.