Skip to content

Commit

Permalink
Remove pod-stuck-in-terminating handling and rename the "important" event annotation
Browse files: browse the repository at this point in the history
Signed-off-by: Jerry Belmonte <[email protected]>
Co-authored-by: Jerry Belmonte <[email protected]>
Co-authored-by: Yuqi Jin <[email protected]>
  • Loading branch information
3 people committed Nov 14, 2023
1 parent 1791ece commit 1de4627
Show file tree
Hide file tree
Showing 8 changed files with 9 additions and 72 deletions.
3 changes: 1 addition & 2 deletions collector/hack/test/files/all-metrics.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,6 @@
{"Name":"kubernetes.pod.uptime","Tags":{"cluster":"","label.k8s-app":"prom-example","label.name":"prom-example","namespace_name": "collector-targets","nodename":"","pod_name":"","source":"","type":"pod"}}
{"Name":"kubernetes.pod.uptime","Tags":{"cluster":"","label.k8s-app":"wavefront-collector","namespace_name":"wavefront-collector","nodename":"","pod_name":"","source":"","type":"pod"}}
{"Name":"kubernetes.pod.uptime","Tags":{"cluster":"","namespace_name":"","nodename":"","pod_name":"","source":"","type":"pod"}}
{"Name":"kubernetes.pod.terminating","Value":"1","Tags":{"cluster":"","namespace_name":"collector-targets","nodename":"","pod_name":"pod-stuck-in-terminating","source":"","type":"pod", "DeletionTimestamp":"","workload_name":"","reason": ""}}
{"Name":"kubernetes.pod_container.cpu.limit","Tags":{"cluster":"","container_base_image":"","container_name":"memcached","label.app.kubernetes.io/instance":"memcached-release","label.app.kubernetes.io/managed-by":"Helm","label.app.kubernetes.io/name":"memcached","label.helm.sh/chart":"","namespace_name": "collector-targets","nodename":"","pod_name":"","source":"","type":"pod_container","workload_name":"","workload_kind":""}}
{"Name":"kubernetes.pod_container.cpu.limit","Tags":{"cluster":"","container_base_image":"","container_name":"mysql","label.app.kubernetes.io/component":"primary","label.app.kubernetes.io/instance":"mysql-release","label.app.kubernetes.io/managed-by":"Helm","label.app.kubernetes.io/name":"mysql","label.helm.sh/chart":"","label.statefulset.kubernetes.io/pod-name":"mysql-release-0","namespace_name": "collector-targets","nodename":"","pod_name":"","source":"","type":"pod_container"}}
{"Name":"kubernetes.pod_container.cpu.limit", "Value":"200", "Tags":{"cluster":"","container_base_image":"","container_name":"prom-example","label.k8s-app":"prom-example","label.name":"prom-example","namespace_name": "collector-targets","nodename":"","pod_name":"","source":"","type":"pod_container"}}
Expand Down Expand Up @@ -523,7 +522,7 @@
{"Name":"kubernetes.workload.status","Value":"0","Tags":{"cluster":"","namespace_name":"collector-targets","source":"","workload_name":"pod-cannot-be-scheduled","workload_kind":"Pod","available":"0","desired":"1","reason":"Unschedulable","message":"","!type":""}}
{"Name":"kubernetes.workload.status","Value":"0","Tags":{"cluster":"","namespace_name":"collector-targets","source":"","workload_name":"pod-crash-loop-backoff","workload_kind":"Pod","available":"0","desired":"1","reason":"CrashLoopBackOff","message":"","!type":""}}
{"Name":"kubernetes.workload.status","Value":"0","Tags":{"cluster":"","namespace_name":"collector-targets","source":"","workload_name":"pod-image-cannot-be-loaded","workload_kind":"Pod","available":"0","desired":"1","reason":"ImagePullBackOff","message":"","!type":""}}
{"Name":"kubernetes.workload.status","Value":"0","Tags":{"cluster":"","namespace_name":"collector-targets","source":"","workload_name":"pod-stuck-in-terminating","workload_kind":"Pod","available":"0","desired":"1","reason":"Terminating","message":"","!type":""}}
{"Name":"kubernetes.workload.status","Value":"0","Tags":{"cluster":"","namespace_name":"collector-targets","source":"","workload_name":"pod-stuck-in-terminating","workload_kind":"Pod","available":"0","desired":"1","reason":"Error","!type":""}}
{"Name":"kubernetes.workload.status","Value":"1","Tags":{"cluster":"","namespace_name":"collector-targets","source":"","workload_name":"pod-large-init-container","workload_kind":"Pod","available":"1","desired":"1","!type":""}}
{"Name":"kubernetes.workload.status","Value":"1","Tags":{"cluster":"","namespace_name":"collector-targets","source":"","workload_name":"pod-small-init-container","workload_kind":"Pod","available":"1","desired":"1","!type":""}}
{"Name":"kubernetes.workload.status","Value":"1","Tags":{"cluster":"","namespace_name":"collector-targets","source":"","workload_name":"prom-example","workload_kind":"Pod","available":"1","desired":"1","!type":""}}
Expand Down
18 changes: 2 additions & 16 deletions collector/plugins/events/eventannotater.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ func (em *eventMatcher) categorize(event *v1.Event) bool {
if em.match(event) {
event.ObjectMeta.Annotations["aria/category"] = em.getCategory(event)
event.ObjectMeta.Annotations["aria/subcategory"] = em.getSubcategory(event)
event.ObjectMeta.Annotations["internal/important"] = "true"
event.ObjectMeta.Annotations["important"] = "true"
return true
} else {
event.ObjectMeta.Annotations["internal/important"] = "false"
event.ObjectMeta.Annotations["important"] = "false"
return false
}
}
Expand Down Expand Up @@ -182,20 +182,6 @@ func (ea *EventAnnotator) runtimeMatchers() []eventMatcher {
category: Runtime,
subcategory: OOMKilled,
},
{
match: func(event *v1.Event) bool {
if event.Type == v1.EventTypeNormal && event.Reason == "Killing" {
pod, err := ea.workloadCache.GetPod(event.InvolvedObject.Name, event.InvolvedObject.Namespace)
if err != nil {
return false
}
return util.IsStuckInTerminating(pod)
}
return false
},
category: Runtime,
subcategory: Terminating,
},
}
}

Expand Down
19 changes: 1 addition & 18 deletions collector/plugins/events/eventannotater_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"testing"

"github.com/wavefronthq/observability-for-kubernetes/collector/internal/testhelper"
"github.com/wavefronthq/observability-for-kubernetes/collector/internal/util"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/yaml"
Expand Down Expand Up @@ -54,12 +53,6 @@ func TestCategorizeMatching(t *testing.T) {
validateCategorySubcategory(t, "examples/unhealthy.yaml", Runtime, Unhealthy, "true")
})

t.Run("Pod stuck in Terminating", func(t *testing.T) {
ea := setupAnnotator(t)
ea.workloadCache = testhelper.NewFakeWorkloadCache("some-workload-name", "some-workload-kind", "some-node-name", util.GetPodStuckInTerminating())
validateAnnotations(t, ea, "examples/pod_terminating.yaml", Runtime, Terminating, "true")
})

t.Run("Out-of-memory killed", func(t *testing.T) {
validateCategorySubcategory(t, "examples/oom_killed.yaml", Runtime, OOMKilled, "true")
})
Expand Down Expand Up @@ -92,16 +85,6 @@ func TestCategorizeNonMatching(t *testing.T) {
validateCategorySubcategory(t, "examples/normal_pulling_image.yaml", "", "", "false")
})

t.Run("Pod Terminating Gracefully", func(t *testing.T) {
ea := setupAnnotator(t)
ea.workloadCache = testhelper.NewFakeWorkloadCache("some-workload-name", "some-workload-kind", "some-node-name", fakePod())
validateAnnotations(t, ea, "examples/pod_terminating.yaml", "", "", "false")
})

t.Run("Pod Terminating event, pod not found", func(t *testing.T) {
validateCategorySubcategory(t, "examples/pod_terminating.yaml", "", "", "false")
})

}

func validateCategorySubcategory(t *testing.T, file, category, subcategory, important string) {
Expand All @@ -116,7 +99,7 @@ func validateAnnotations(t *testing.T, ea *EventAnnotator, file, category, subca
ea.annotate(&event)
require.Equal(t, category, event.ObjectMeta.Annotations["aria/category"])
require.Equal(t, subcategory, event.ObjectMeta.Annotations["aria/subcategory"])
require.Equal(t, important, event.ObjectMeta.Annotations["internal/important"])
require.Equal(t, important, event.ObjectMeta.Annotations["important"])
}
}

Expand Down
4 changes: 2 additions & 2 deletions collector/plugins/events/eventrouter.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ func (er *EventRouter) addEvent(obj interface{}, isInInitialList bool) {
"reason": e.Reason,
"component": e.Source.Component,
"type": e.Type,
"important": e.Annotations["internal/important"],
"important": e.Annotations["important"],
}

resourceName := e.InvolvedObject.Name
Expand All @@ -166,7 +166,7 @@ func (er *EventRouter) addEvent(obj interface{}, isInInitialList bool) {
filteredEvents.Inc(1)
return
}
delete(e.Annotations, "internal/important")
delete(e.Annotations, "important")
delete(tags, "important")
sentEvents.Inc(1)

Expand Down
2 changes: 1 addition & 1 deletion collector/plugins/events/eventrouter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ func TestAddEvent(t *testing.T) {
event.Type = v1.EventTypeWarning
er.addEvent(event, false)
require.Empty(t, sink.Annotations["important"])
require.Empty(t, event.Annotations["internal/important"])
require.Empty(t, event.Annotations["important"])
})
}

Expand Down
33 changes: 0 additions & 33 deletions collector/plugins/events/examples/pod_terminating.yaml

This file was deleted.

1 change: 1 addition & 0 deletions collector/plugins/processors/pod_based_enricher.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ func (pbe *PodBasedEnricher) addWorkloadStatusMetric(podMs *metrics.Set, pod *ku
} else if containerStatus.State.Terminated != nil {
workloadStatus = 0
workloadMs.Labels[metrics.LabelReason.Key] = containerStatus.State.Terminated.Reason
// TODO: add message for pod stuck in terminating
workloadMs.Labels[metrics.LabelMessage.Key] = containerStatus.State.Terminated.Message
break
}
Expand Down
1 change: 1 addition & 0 deletions collector/plugins/sources/kstate/non_running_pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ func buildPodTerminatingMetrics(pod *v1.Pod, sharedTags map[string]string, trans
tags["reason"] = "Terminating"

for _, condition := range pod.Status.Conditions {
// TODO: move the terminating logic out of non_running_pods and into pod_based_enricher
if condition.Type == v1.PodScheduled && condition.Status == "False" {
tags[metrics.LabelNodename.Key] = "none"
}
Expand Down

0 comments on commit 1de4627

Please sign in to comment.