feat(metrics): collect host metrics
Use the hostmetrics receiver to collect host metrics from the underlying
node:
- system.cpu.utilization
- system.filesystem.utilization
- system.memory.utilization
basti1302 committed Feb 5, 2025
1 parent e5ca830 commit 70293dc
Showing 9 changed files with 223 additions and 43 deletions.
28 changes: 27 additions & 1 deletion cmd/main.go
@@ -99,6 +99,7 @@ var (
setupLog = ctrl.Log.WithName("setup")

startupTasksK8sClient client.Client
isDocker bool
deploymentSelfReference *appsv1.Deployment
envVars environmentVariables

@@ -287,14 +288,19 @@ func main() {
if err = initStartupTasksK8sClient(&setupLog); err != nil {
os.Exit(1)
}
detectDocker(
ctx,
startupTasksK8sClient,
&setupLog,
)
if err = findDeploymentSelfReference(
ctx,
startupTasksK8sClient,
envVars.operatorNamespace,
envVars.deploymentName,
&setupLog,
); err != nil {
setupLog.Error(err, "The Dash0 operator manager process to lookup its own deployment.")
setupLog.Error(err, "The Dash0 operator manager process to lookup its own deployment failed.")
os.Exit(1)
}

@@ -613,6 +619,7 @@ func startDash0Controllers(
OTelCollectorNamePrefix: envVars.oTelCollectorNamePrefix,
OTelColResourceSpecs: oTelColResourceSpecs,
IsIPv6Cluster: isIPv6Cluster,
IsDocker: isDocker,
DevelopmentMode: developmentMode,
}
backendConnectionManager := &backendconnection.BackendConnectionManager{
@@ -741,6 +748,25 @@ func initStartupTasksK8sClient(logger *logr.Logger) error {
return nil
}

func detectDocker(
ctx context.Context,
k8sClient client.Client,
logger *logr.Logger,
) {
nodeList := &corev1.NodeList{}
err := k8sClient.List(ctx, nodeList, &client.ListOptions{Limit: 1})
if err != nil {
logger.Error(err, "cannot list nodes for container runtime detection")
// assume it's not Docker
return
}
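// Only a single node is listed (Limit: 1); the container runtime is assumed to be
// the same on every node of the cluster.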
for _, node := range nodeList.Items {
if strings.Contains(node.Status.NodeInfo.ContainerRuntimeVersion, "docker://") {
isDocker = true
}
}
}

func findDeploymentSelfReference(
ctx context.Context,
k8sClient client.Client,
51 changes: 37 additions & 14 deletions helm-chart/dash0-operator/README.md
@@ -473,11 +473,12 @@ By default, the operator collects metrics as follows:
* The operator collects node, pod, container, and volume metrics from the API server on
[kubelets](https://kubernetes.io/docs/concepts/architecture/#kubelet)
via the
[Kubelet Stats Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/README.md)
and cluster-level metrics from the Kubernetes API server
via the
[Kubernetes Cluster Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/k8sclusterreceiver/README.md)
This can be disabled per cluster by setting `kubernetesInfrastructureMetricsCollectionEnabled: false` in the Dash0
[Kubelet Stats Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/README.md),
cluster-level metrics from the Kubernetes API server via the
[Kubernetes Cluster Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/k8sclusterreceiver/README.md),
and system metrics from the underlying nodes via the
[Host Metrics Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/hostmetricsreceiver/README.md).
Collecting these metrics can be disabled per cluster by setting `kubernetesInfrastructureMetricsCollectionEnabled: false` in the Dash0
operator configuration resource (or setting the value `operator.kubernetesInfrastructureMetricsCollectionEnabled` to
`false` when deploying the operator configuration resource via the Helm chart).
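A minimal Helm values sketch for disabling this collection (only the value name
`operator.kubernetesInfrastructureMetricsCollectionEnabled` is taken from this section; the surrounding values-file
layout follows from the dotted `--set` path and is otherwise an assumption):

```
# values.yaml (sketch): disables collection of kubelet, cluster, and host metrics
operator:
  kubernetesInfrastructureMetricsCollectionEnabled: false
```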
* Namespace-scoped metrics (e.g. metrics related to a workload running in a specific namespace) will only be collected
@@ -955,30 +956,52 @@ You can work around this issue by one of the following methods:

## Notes on Running The Operator on Docker Desktop

Docker Desktop's Kubelet `/stats` endpoint uses a self-signed TLS certificate, which is not even signed by the cluster's certificate authority, and it looks invalid from the point of view of the OpenTelemetry collectors operated by the Dash0 operator.
Docker Desktop's Kubelet `/stats` endpoint uses a self-signed TLS certificate, which is not even signed by the cluster's
certificate authority, and it looks invalid from the point of view of the OpenTelemetry collectors operated by the Dash0
operator.

To ensure that you can still have a good experience trying out the Dash0 operator in a local environment, the Dash0 operator detects that is is running via the Kubernetes integrated in Docker Desktop (by matching the node name against `docker-desktop`) and disables the TLS verification for the receiver that retrieve metrics from the Kubelet `/stats` endpoint. If this is the case, you will see the following entry in the logs of the daemonset operated by the Dash0 operator:
To ensure that you can still have a good experience trying out the Dash0 operator in a local environment, the Dash0
operator detects that it is running via the Kubernetes integrated in Docker Desktop (by matching the node name against
`docker-desktop`) and disables TLS verification for the receiver that retrieves metrics from the Kubelet `/stats`
endpoint. If this is the case, you will see the following entry in the logs of the daemonset operated by the Dash0
operator:

```
This collector seems to run on a node managed by Docker Desktop's Kubernetes, which is known to have self-signed CA certs for the Kubelet stats endpoint: disabling TLS verification for the kubeletstat receiver
```

Furthermore, the `hostmetrics` receiver will be disabled when using Docker as the container runtime.

## Notes on Running The Operator on Minikube

Docker Desktop's Kubelet `/stats` endpoint uses a self-signed TLS certificate, which is not even signed by the cluster's certificate authority, and it looks invalid from the point of view of the OpenTelemetry collectors operated by the Dash0 operator.
Minikube's Kubelet `/stats` endpoint uses a self-signed TLS certificate, which is not even signed by the cluster's
certificate authority, and it looks invalid from the point of view of the OpenTelemetry collectors operated by the Dash0
operator.

To ensure that you can still have a good experience trying out the Dash0 operator in a local environment, the Dash0 operator detects that is is running via the Kubernetes integrated in Docker Desktop (by matching the node name against `minikube`) and disables the TLS verification for the receiver that retrieve metrics from the Kubelet `/stats` endpoint. If this is the case, you will see the following entry in the logs of the daemonset operated by the Dash0 operator:
To ensure that you can still have a good experience trying out the Dash0 operator in a local environment, the Dash0
operator detects that it is running on Minikube (by matching the node name against `minikube`) and disables TLS
verification for the receiver that retrieves metrics from the Kubelet `/stats` endpoint. If this is the case, you will
see the following entry in the logs of the daemonset operated by the Dash0 operator:

```
This collector seems to run on a node managed by Minikube, which is known to have self-signed CA certs for the Kubelet stats endpoint: disabling TLS verification for the kubeletstat receiver
```

Note: if you use a minikube profile (`minikube -p <profile_name>`), the node name will change, and the automatic disablement of TLS verification will not work, resulting on missing metrics in Dash0 about your workloads.
If this is the case, you can run the Dash0 operator in development mode (which results, among other things, in disabling the TLS verification for the Kubelet `/stats` endpoint) by passing the `--set operator.developmentMode=true` configuration to Helm.
Note: if you use a minikube profile (`minikube -p <profile_name>`), the node name will change, and the automatic
disablement of TLS verification will not work, resulting in missing metrics in Dash0 about your workloads. If this is
the case, you can run the Dash0 operator in development mode (which results, among other things, in disabling TLS
verification for the Kubelet `/stats` endpoint) by passing the `--set operator.developmentMode=true` configuration to
Helm.
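
As a sketch, the same setting can be expressed in a Helm values file instead of a `--set` flag:

```
# values.yaml (sketch): equivalent to --set operator.developmentMode=true
operator:
  developmentMode: true
```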

Furthermore, the `hostmetrics` receiver will be disabled when using Docker as the container runtime.

## Notes on Running The Operator on Kind

Kind's Kubelet `/stats` endpoint uses a self-signed TLS certificate, which is not even signed by the cluster's certificate authority, and it looks invalid from the point of view of the OpenTelemetry collectors operated by the Dash0 operator.
Kind's Kubelet `/stats` endpoint uses a self-signed TLS certificate, which is not even signed by the cluster's
certificate authority, and it looks invalid from the point of view of the OpenTelemetry collectors operated by the Dash0
operator.

Unfortunately, unlike the cases for Docker Desktop and Minikube, there is no known heuristic for an operator to know whether it's running on a Kind cluster.
If you want to collect Kubelet metrics from a Kind cluster, you will need to run the Dash0 operator in development mode (which results, among other things, in disabling the TLS verification for the Kubelet `/stats` endpoint) by passing the `--set operator.developmentMode=true` configuration to Helm.
Unfortunately, unlike the cases for Docker Desktop and Minikube, there is no known heuristic for an operator to know
whether it's running on a Kind cluster. If you want to collect Kubelet metrics from a Kind cluster, you will need to
run the Dash0 operator in development mode (which results, among other things, in disabling the TLS verification for the
Kubelet `/stats` endpoint) by passing the `--set operator.developmentMode=true` configuration to Helm.
1 change: 1 addition & 0 deletions images/collector/src/builder/config.yaml
@@ -21,6 +21,7 @@ exporters:
receivers:
- gomod: "go.opentelemetry.io/collector/receiver/otlpreceiver v0.111.0"
- gomod: "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/filelogreceiver v0.111.0"
- gomod: "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver v0.111.0"
- gomod: "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/k8sclusterreceiver v0.111.0"
- gomod: "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/kubeletstatsreceiver v0.111.0"
- gomod: "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.111.0"
@@ -24,6 +24,7 @@ type collectorConfigurationTemplateValues struct {
Exporters []OtlpExporter
IgnoreLogsFromNamespaces []string
KubernetesInfrastructureMetricsCollectionEnabled bool
UseHostMetricsReceiver bool
ClusterName string
NamespaceOttlFilter string
NamespacesWithPrometheusScraping []string
@@ -117,11 +118,12 @@ func assembleCollectorConfigMap(
config.Namespace,
},
KubernetesInfrastructureMetricsCollectionEnabled: config.KubernetesInfrastructureMetricsCollectionEnabled,
ClusterName: config.ClusterName,
NamespaceOttlFilter: namespaceOttlFilter,
NamespacesWithPrometheusScraping: namespacesWithPrometheusScraping,
SelfIpReference: selfIpReference,
DevelopmentMode: config.DevelopmentMode,
UseHostMetricsReceiver: config.UseHostMetricsReceiver,
ClusterName: config.ClusterName,
NamespaceOttlFilter: namespaceOttlFilter,
NamespacesWithPrometheusScraping: namespacesWithPrometheusScraping,
SelfIpReference: selfIpReference,
DevelopmentMode: config.DevelopmentMode,
})
if err != nil {
return nil, fmt.Errorf("cannot render the collector configuration template: %w", err)
@@ -661,30 +661,35 @@ var _ = Describe("The OpenTelemetry Collector ConfigMaps", func() {
Expect(selfMonitoringTelemetryResource.(map[string]interface{})["k8s.cluster.name"]).To(Equal("cluster-name"))
}, testConfigs)

Describe("should enable/disable kubernetes infrastructure metrics collection", func() {
It("should not render the kubeletstats receiver if kubernetes infrastructure metrics collection is disabled", func() {
Describe("should enable/disable kubernetes infrastructure metrics collection and the hostmetrics receiver", func() {
It("should not render the kubeletstats receiver and hostmetrics if kubernetes infrastructure metrics collection is disabled", func() {
configMap, err := assembleDaemonSetCollectorConfigMap(&oTelColConfig{
Namespace: namespace,
NamePrefix: namePrefix,
Export: Dash0ExportWithEndpointAndToken(),
KubernetesInfrastructureMetricsCollectionEnabled: false,
UseHostMetricsReceiver: false,
}, nil, nil, false)
Expect(err).ToNot(HaveOccurred())
collectorConfig := parseConfigMapContent(configMap)
kubeletstatsReceiver := readFromMap(collectorConfig, []string{"receivers", "kubeletstats"})
Expect(kubeletstatsReceiver).To(BeNil())
hostmetricsReceiver := readFromMap(collectorConfig, []string{"receivers", "hostmetrics"})
Expect(hostmetricsReceiver).To(BeNil())

pipelines := readPipelines(collectorConfig)
metricsReceivers := readPipelineReceivers(pipelines, "metrics/downstream")
Expect(metricsReceivers).ToNot(ContainElement("kubeletstats"))
Expect(metricsReceivers).ToNot(ContainElement("hostmetrics"))
})

It("should render the kubeletstats receiver if kubernetes infrastructure metrics collection is enabled", func() {
It("should render the kubeletstats and hostmetrics receiver if kubernetes infrastructure metrics collection is enabled", func() {
configMap, err := assembleDaemonSetCollectorConfigMap(&oTelColConfig{
Namespace: namespace,
NamePrefix: namePrefix,
Export: Dash0ExportWithEndpointAndToken(),
KubernetesInfrastructureMetricsCollectionEnabled: true,
UseHostMetricsReceiver: true,
}, nil, nil, false)
Expect(err).ToNot(HaveOccurred())
collectorConfig := parseConfigMapContent(configMap)
@@ -694,10 +699,13 @@ var _ = Describe("The OpenTelemetry Collector ConfigMaps", func() {
insecureSkipVerifyPropertyValue, hasInsecureSkipVerifyProperty := kubeletstatsReceiver["insecure_skip_verify"]
Expect(hasInsecureSkipVerifyProperty).To(BeTrue())
Expect(insecureSkipVerifyPropertyValue).To(Equal("${env:KUBELET_STATS_TLS_INSECURE}"))
hostmetricsReceiver := readFromMap(collectorConfig, []string{"receivers", "hostmetrics"})
Expect(hostmetricsReceiver).ToNot(BeNil())

pipelines := readPipelines(collectorConfig)
metricsReceivers := readPipelineReceivers(pipelines, "metrics/downstream")
Expect(metricsReceivers).To(ContainElement("kubeletstats"))
Expect(metricsReceivers).To(ContainElement("hostmetrics"))
})
})

@@ -171,7 +171,77 @@ authority"}

This env var is set in the collector image's entrypoint script or in the daemonset spec if DevelopmentMode is active */}}
insecure_skip_verify: ${env:KUBELET_STATS_TLS_INSECURE}
{{- end }}
{{- end }}{{/* if .KubernetesInfrastructureMetricsCollectionEnabled */}}

{{- if .UseHostMetricsReceiver }}
hostmetrics:
collection_interval: 60s
root_path: /hostfs
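# root_path points the scrapers at the node's file system, which the collector
# daemonset is expected to mount at /hostfs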
scrapers:
cpu:
metrics:
# disable all metrics that are enabled by default (there is only one)
system.cpu.time:
enabled: false
# Explicitly enable the cpu utilization metric
system.cpu.utilization:
enabled: true
filesystem:
metrics:
# disable all metrics that are enabled by default
system.filesystem.inodes.usage:
enabled: false
system.filesystem.usage:
enabled: false
system.filesystem.utilization:
enabled: true
{{- /*
exclude_fs_types and exclude_mount_points are taken from the collector's official Helm chart:
https://github.com/open-telemetry/opentelemetry-helm-charts/blob/f75b282f2867f1c9689ba9e28083cba15a98b66b/charts/opentelemetry-collector/templates/_config.tpl#L69-L115
*/}}
exclude_fs_types:
fs_types:
- autofs
- binfmt_misc
- bpf
- cgroup2
- configfs
- debugfs
- devpts
- devtmpfs
- fusectl
- hugetlbfs
- iso9660
- mqueue
- nsfs
- overlay
- proc
- procfs
- pstore
- rpc_pipefs
- securityfs
- selinuxfs
- squashfs
- sysfs
- tracefs
match_type: strict
exclude_mount_points:
match_type: regexp
mount_points:
- /dev/*
- /proc/*
- /sys/*
- /run/k3s/containerd/*
- /var/lib/docker/*
- /var/lib/kubelet/*
- /snap/*
memory:
metrics:
system.memory.usage:
enabled: false
system.memory.utilization:
enabled: true
{{- end }}{{/* if .UseHostMetricsReceiver */}}

{{- $hasPrometheusScrapingEnabledForAtLeastOneNamespace := gt (len .NamespacesWithPrometheusScraping) 0 }}

Expand Down Expand Up @@ -376,6 +446,9 @@ service:
{{- if .KubernetesInfrastructureMetricsCollectionEnabled }}
- kubeletstats
{{- end }}
{{- if .UseHostMetricsReceiver }}
- hostmetrics
{{- end }}
{{- if $hasPrometheusScrapingEnabledForAtLeastOneNamespace }}
- prometheus
{{- end }}