Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automate events categorization check in integration test #487

Merged
merged 16 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions ci.Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,8 @@ pipeline {
lock("integration-test-eks") {
/* Operator Integration Tests */
sh 'make clean-cluster'
sh 'make -C operator integration-test INTEGRATION_TEST_ARGS="-r advanced -r common-metrics"'
/* k8s-events-only and common-metrics should not be run consecutively */
sh 'make -C operator integration-test INTEGRATION_TEST_ARGS="-r k8s-events-only -r advanced -r common-metrics"'
sh 'CLEAN_CLUSTER_ARGS="-n" make clean-cluster'
}
}
Expand Down Expand Up @@ -330,7 +331,8 @@ pipeline {
lock("integration-test-aks") {
/* Operator Integration Tests */
sh 'make clean-cluster'
sh 'make -C operator integration-test INTEGRATION_TEST_ARGS="-r validation-errors -r validation-legacy -r validation-errors-preprocessor-rules -r allow-legacy-install -r common-metrics"'
/* k8s-events-only and common-metrics should not be run consecutively */
sh 'make -C operator integration-test INTEGRATION_TEST_ARGS="-r k8s-events-only -r validation-errors -r validation-legacy -r validation-errors-preprocessor-rules -r allow-legacy-install -r common-metrics"'
sh 'CLEAN_CLUSTER_ARGS="-n" make clean-cluster'
}
}
Expand Down
4 changes: 2 additions & 2 deletions collector/hack/test/base/test-proxy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ spec:
- amd64
containers:
- name: wavefront-proxy
image: projects.registry.vmware.com/tanzu_observability_keights_saas/test-proxy:2.9.10
imagePullPolicy: Always
image: projects.registry.vmware.com/tanzu_observability_keights_saas/test-proxy:2.9.14
imagePullPolicy: IfNotPresent
command: [ "/test-proxy", "-proxy", ":2878", "-logLevel", "info" ]
ports:
- name: proxy
Expand Down
18 changes: 16 additions & 2 deletions collector/plugins/events/eventrouter.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,18 @@ func (er *EventRouter) addEvent(obj interface{}, isInInitialList bool) {
ns = "default"
}

var component string
if len(e.Source.Component) > 0 {
component = e.Source.Component
} else {
component = e.ReportingController
}

tags := map[string]string{
"namespace_name": ns,
"kind": e.InvolvedObject.Kind,
"reason": e.Reason,
"component": e.Source.Component,
"component": component,
"type": e.Type,
"important": e.Annotations["important"],
}
Expand All @@ -170,9 +177,16 @@ func (er *EventRouter) addEvent(obj interface{}, isInInitialList bool) {
delete(tags, "important")
sentEvents.Inc(1)

var lastTimestamp time.Time
if e.LastTimestamp.IsZero() {
lastTimestamp = e.EventTime.Time
} else {
lastTimestamp = e.LastTimestamp.Time
}

er.sink.ExportEvent(newEvent(
e.Message,
e.LastTimestamp.Time,
lastTimestamp,
e.Source.Host,
tags,
*e,
Expand Down
24 changes: 24 additions & 0 deletions collector/plugins/events/eventrouter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,30 @@ func TestAddEvent(t *testing.T) {
require.Equal(t, "test-namespace", event.InvolvedObject.Namespace)
})

t.Run("sets the lastTimestamp and source component for FailedScheduling k8s events", func(t *testing.T) {
	// Build an event that only carries EventTime and ReportingController;
	// LastTimestamp and Source.Component are deliberately left at their zero values.
	sink := &MockExport{}
	er := NewEventRouter(fake.NewSimpleClientset(), configuration.EventsConfig{}, sink, true, testhelper.NewEmptyFakeWorkloadCache())
	event := &v1.Event{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "test-namespace",
		},
		Message:   "0/3 nodes are available: 3 Insufficient memory. preemption: 0/3 nodes\n are available: 3 No preemption victims found for incoming pod..",
		EventTime: metav1.NewMicroTime(time.Now()),
		InvolvedObject: v1.ObjectReference{
			Namespace: "test-namespace",
			Kind:      "some-kind",
			Name:      "test-name",
		},
		Type:                "Warning",
		Reason:              "FailedScheduling",
		ReportingController: "default-scheduler",
	}

	er.addEvent(event, false)

	// LastTimestamp is unset, so a non-zero exported timestamp presumably means
	// addEvent fell back to EventTime (assumes MockExport.Ts records the exported
	// timestamp — confirm against the mock).
	require.False(t, sink.Ts.IsZero())
	// Source.Component is empty, so the component annotation is expected to be
	// taken from ReportingController.
	require.Equal(t, "default-scheduler", sink.Annotations["component"])
})

t.Run("does not send add events for events that already existed prior to startup", func(t *testing.T) {
event := fakeEvent()
client := fake.NewSimpleClientset(event)
Expand Down
34 changes: 18 additions & 16 deletions collector/plugins/events/examples/failed_scheduling.yaml
Original file line number Diff line number Diff line change
@@ -1,29 +1,31 @@
apiVersion: v1
items:
- apiVersion: v1
count: 12
eventTime: null
firstTimestamp: "2023-11-06T21:35:02Z"
- action: Scheduling
apiVersion: v1
eventTime: "2023-12-04T21:41:46.328778Z"
firstTimestamp: null
involvedObject:
apiVersion: v1
kind: Pod
name: pod-cannot-be-scheduled
namespace: collector-targets
resourceVersion: "16226"
uid: 557e30eb-7a22-4386-a6da-5243424c49f4
resourceVersion: "27946519"
uid: 0571a69c-4259-4465-a649-baa3e12a88ee
kind: Event
lastTimestamp: "2023-11-06T21:42:03Z"
lastTimestamp: null
message: '0/3 nodes are available: 3 Insufficient memory. preemption: 0/3 nodes
are available: 3 No preemption victims found for incoming pod..'
metadata:
creationTimestamp: "2023-11-06T21:35:02Z"
name: pod-cannot-be-scheduled.17952642b64bb20b
creationTimestamp: "2023-12-04T21:41:46Z"
name: pod-cannot-be-scheduled.179dbee0a8bc5d36
namespace: collector-targets
resourceVersion: "865"
uid: ba7c9601-4316-4b05-b48a-40306f6a473f
resourceVersion: "51754"
uid: fb67fd57-3090-41e5-8e94-a623ccc7b07c
reason: FailedScheduling
reportingComponent: ""
reportingInstance: ""
source:
component: default-scheduler
type: Warning
reportingComponent: default-scheduler
reportingInstance: default-scheduler-gke-584afebdf6574fe9a52c-5a06-7f4f-vm
source: {}
type: Warning
kind: List
metadata:
resourceVersion: ""
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ spec:
- amd64
containers:
- name: test-proxy
image: projects.registry.vmware.com/tanzu_observability_keights_saas/test-proxy:2.9.10
imagePullPolicy: Always
image: projects.registry.vmware.com/tanzu_observability_keights_saas/test-proxy:2.9.14
imagePullPolicy: IfNotPresent
command: [ "/test-proxy", "-proxy", ":2878", "-logLevel", "info" ]
ports:
- name: proxy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ spec:
- amd64
containers:
- name: test-proxy
image: projects.registry.vmware.com/tanzu_observability_keights_saas/test-proxy:2.9.10
imagePullPolicy: Always
image: projects.registry.vmware.com/tanzu_observability_keights_saas/test-proxy:2.9.14
imagePullPolicy: IfNotPresent
command: [ "/test-proxy", "-proxy", ":2878", "-logLevel", "info" ]
ports:
- name: proxy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ spec:
- amd64
containers:
- name: test-proxy
image: projects.registry.vmware.com/tanzu_observability_keights_saas/test-proxy:2.9.10
imagePullPolicy: Always
image: projects.registry.vmware.com/tanzu_observability_keights_saas/test-proxy:2.9.14
imagePullPolicy: IfNotPresent
command: [ "/test-proxy", "-proxy", ":2878", "-logLevel", "info", "-mode", "logs", "-logFilePath" , "/logs/test-proxy.log"]
ports:
- name: proxy-metrics
Expand Down
69 changes: 52 additions & 17 deletions operator/hack/test/run-e2e-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -148,52 +148,87 @@ function run_proxy_checks() {
}

function run_k8s_events_checks() {
local external_event_count=0
local missing_event_categories_count
local received_event_categories_count
local external_events_fail_count
local external_events_results_file
external_events_results_file=$(mktemp)

wait_for_cluster_ready
sleep 3
start_forward_test_proxy "observability-system" "test-proxy" /dev/null
trap 'stop_forward_test_proxy /dev/null' EXIT

"$REPO_ROOT/scripts/deploy/trigger-events.sh"
"$REPO_ROOT/scripts/deploy/deploy-event-targets.sh"

local external_events_results_file
external_events_results_file=$(mktemp)
local external_event_count=0
for i in {1..60}; do
printf "Asserting external events .."
for i in {1..10}; do
while true; do # wait until we get a good connection
RES_CODE=$(curl --silent --output "$external_events_results_file" --write-out "%{http_code}" "http://localhost:8888/events/external/assert" || echo "000")
[[ $RES_CODE -lt 200 ]] || break
if [[ $RES_CODE -ge 200 ]]; then
break
fi
done

external_event_count=$(jq ".EventCount" "$external_events_results_file")

if [[ $external_event_count -gt 0 ]]; then
break
missing_event_categories_count=$(jq "(.MissingEventCategories | length)" "$external_events_results_file")
if [[ $missing_event_categories_count -eq 0 ]]; then
printf " in %d tries" "$i"
break
fi
fi

sleep 1
printf "."
sleep 3 # flush interval
done
echo " done."

echo "External events results file: $external_events_results_file"
# Helpful for debugging:
# cat "$external_events_results_file" | jq

if [[ $RES_CODE -ge 400 ]]; then
red "INVALID EXTERNAL EVENTS"
exit 1
fi

if [[ $external_event_count -eq 0 ]]; then
red "missing external events."
echo "$external_events_results_file"
red "External events were never received by test-proxy"
exit 1
fi

local external_events_fail_count
external_events_fail_count=$(jq "(.BadEventJSONs | length) + (.MissingFields | length) + (.FirstTimestampsMissing | length) + (.LastTimestampsInvalid | length)" "$external_events_results_file")
missing_event_categories_count=$(jq "(.MissingEventCategories | length)" "$external_events_results_file")
if [[ $missing_event_categories_count -gt 0 ]]; then
red "FAILED: EXPECTED EXTERNAL EVENTS WERE NOT RECEIVED"
red "Missing: $missing_event_categories_count"
red "External event categories missing:"
jq '.MissingEventCategories' "$external_events_results_file"

received_event_categories_count=$(jq "(.ReceivedEventCategories | length)" "$external_events_results_file")
red "Received: $received_event_categories_count"
if [[ $received_event_categories_count -gt 0 ]]; then
red "External event categories received:"
jq '.ReceivedEventCategories | keys' "$external_events_results_file"
fi

red "Total: $external_event_count"
exit 1
fi

echo "external events results: $external_events_results_file"
external_events_fail_count=$(jq "(.BadEventJSONs | length) + (.MissingFields | length) + (.FirstTimestampsMissing | length) + (.LastTimestampsInvalid | length)" "$external_events_results_file")
if [[ $external_events_fail_count -gt 0 ]]; then
red "BadEventJSONs: $(jq "(.BadEventJSONs | length)" "$external_events_results_file")"
red "MissingFields: $(jq "(.MissingFields | length)" "$external_events_results_file")"
red "FirstTimestampsMissing: $(jq "(.FirstTimestampsMissing | length)" "$external_events_results_file")"
red "LastTimestampsInvalid: $(jq "(.LastTimestampsInvalid | length)" "$external_events_results_file")"
if which pbcopy >/dev/null; then
echo "$external_events_results_file" | pbcopy
fi
jq '{BadEventJSONs, MissingFields, FirstTimestampsMissing, LastTimestampsInvalid}' "$external_events_results_file"
exit 1
fi

yellow "Integration test complete. $external_event_count events were received."

"$REPO_ROOT/scripts/deploy/uninstall-targets.sh"
stop_forward_test_proxy /dev/null

Expand Down
24 changes: 24 additions & 0 deletions scripts/deploy/deploy-event-targets.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Resets the collector-targets namespace and applies the manifests used as
# k8s event targets.
set -euo pipefail

REPO_ROOT=$(git rev-parse --show-toplevel)
# Presumably provides the wait_for_* helpers called below; they are not defined here.
source "${REPO_ROOT}/scripts/k8s-utils.sh"

# The manifests are referenced by relative path, so work from this script's directory.
SCRIPT_DIR="$(dirname "$0")"
cd "$SCRIPT_DIR"

echo "Deploying k8s event targets..."

# Best-effort cleanup of a previous deployment: failures of either command are ignored.
# First drop the finalizers on pod-stuck-in-terminating, then delete the namespace.
kubectl patch -n collector-targets pod/pod-stuck-in-terminating \
  -p '{"metadata":{"finalizers":null}}' &>/dev/null || true
kubectl delete --ignore-not-found=true namespace collector-targets 2>/dev/null || true
wait_for_cluster_resource_deleted namespace/collector-targets

wait_for_namespace_created collector-targets
wait_for_namespaced_resource_created collector-targets serviceaccount/default

# Apply each event-target manifest in order; with `set -e` the first failure aborts the script.
for manifest in \
  provisioning_failure.yaml \
  running-pod-crash-loop-backoff.yaml \
  pending-pod-image-cannot-be-loaded.yaml \
  pending-pod-cannot-be-scheduled.yaml; do
  kubectl apply -f "$manifest" >/dev/null
done

echo "Finished deploying k8s event targets"
Loading