From 32863fdcc3791d0df9b517de7715cdce5c3c57b7 Mon Sep 17 00:00:00 2001 From: Scott Trent Date: Fri, 15 Mar 2024 14:45:52 +0900 Subject: [PATCH] enable customization of Kepler Metric Signed-off-by: Scott Trent --- README.md | 22 +++++++++++++++++++ cmd/main.go | 5 ++++- deployment/deploy.sh | 9 ++++++++ .../templates/deployment.yaml | 1 + deployment/susql-controller/values.yaml | 1 + internal/controller/labelgroup_controller.go | 4 ++-- 6 files changed, 39 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fa9c768..5c8daed 100644 --- a/README.md +++ b/README.md @@ -84,3 +84,25 @@ Energy of the group of pods is exposed in 2 ways: * Through Prometheus at `http://prometheus-susql.openshift-kepler-operator.svc.cluster.local:9090` using the query `susql_total_energy_joules{susql_label_1=my-label-1,susql_label_2=my-label-2}` * From `status` of the `LabelGroup` CRD given as `labelgroup.status.totalEnergy` + +### Installation Configuration Options + +| Environment Variable | Default Value | Description | +|----------------------------|----------------------------|--------------------------------------------| +| SUSQL_NAMESPACE | openshift-kepler-operator | namespace that SUSQL resources run in | +| KEPLER_PROMETHEUS_NAMESPACE | openshift-monitoring | namespace that Kepler Prometheus runs in | +| PROMETHEUS_PROTOCOL | http | Either http or https for Kepler Prometheus access | +| PROMETHEUS_SERVICE | prometheus-k8s | service name for the Kepler Prometheus | +| PROMETHEUS_NAMESPACE | monitoring | namespace used by the Kepler Prometheus | +| PROMETHEUS_DOMAIN | svc.cluster.local | Domain used by the Kepler Prometheus | +| PROMETHEUS_PORT | 9090 | Port used by the Kepler Prometheus | +| KEPLER_PROMETHEUS_URL | http://prometheus-k8s.monitoring.svc.cluster.local:9090 | A shortcut to specify final Kepler Prometheus URL | +| KEPLER_METRIC_NAME | kepler_container_joules_total | Metric queried in the Kepler Prometheus | +| SUSQL_PROMETHEUS_URL | 
http://prometheus-susql.openshift-kepler-operator.svc.cluster.local:9090 | SusQL Prometheus URL | +| SUSQL_SAMPLING_RATE | 2 | Sampling rate in seconds | +| SUSQL_ENHANCED | | If set to any string, then use enhanced RBAC and SMON configuration | +| SUSQL_REGISTRY | quay.io/sustainable_computing_io | Container registry that SusQL is stored in | +| SUSQL_IMAGE_NAME | susql_operator | Image name used on SusQL container registry | +| SUSQL_IMAGE_TAG | latest | Tag for SusQL container | +|----------------------------|----------------------------|---------------------------------------------| + diff --git a/cmd/main.go b/cmd/main.go index 4697a10..e923eea 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -55,6 +55,7 @@ func main() { var enableLeaderElection bool var probeAddr string var keplerPrometheusUrl string + var keplerMetricName string var susqlPrometheusMetricsUrl string var susqlPrometheusDatabaseUrl string var samplingRate string @@ -63,9 +64,10 @@ func main() { flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. 
Enabling this will ensure there is only one active controller manager.") flag.StringVar(&keplerPrometheusUrl, "kepler-prometheus-url", "", "The URL for the Prometheus server where Kepler stores the energy data") + flag.StringVar(&keplerMetricName, "kepler-metric-name", "kepler_container_joules_total", "The metric name to be queried in the kepler Prometheus server") flag.StringVar(&susqlPrometheusDatabaseUrl, "susql-prometheus-database-url", "", "The URL for the Prometheus database where SusQL stores the energy data") flag.StringVar(&susqlPrometheusMetricsUrl, "susql-prometheus-metrics-url", "", "The URL for the Prometheus metrics where SusQL exposes the energy data") - flag.StringVar(&samplingRate, "sampling-rate", "", "Sampling rate in seconds") + flag.StringVar(&samplingRate, "sampling-rate", "2", "Sampling rate in seconds") opts := zap.Options{ Development: true, @@ -108,6 +110,7 @@ func main() { Client: mgr.GetClient(), Scheme: mgr.GetScheme(), KeplerPrometheusUrl: keplerPrometheusUrl, + KeplerMetricName: keplerMetricName, SusQLPrometheusDatabaseUrl: susqlPrometheusDatabaseUrl, SusQLPrometheusMetricsUrl: susqlPrometheusMetricsUrl, SamplingRate: time.Duration(samplingRateInteger) * time.Second, diff --git a/deployment/deploy.sh b/deployment/deploy.sh index 1ae9c50..a6f1778 100644 --- a/deployment/deploy.sh +++ b/deployment/deploy.sh @@ -57,6 +57,10 @@ if [[ -z ${KEPLER_PROMETHEUS_URL} ]]; then KEPLER_PROMETHEUS_URL="${PROMETHEUS_PROTOCOL}://${PROMETHEUS_SERVICE}.${PROMETHEUS_NAMESPACE}.${PROMETHEUS_DOMAIN}:${PROMETHEUS_PORT}" fi +if [[ -z ${KEPLER_METRIC_NAME} ]]; then + KEPLER_METRIC_NAME="kepler_container_joules_total" +fi + if [[ -z ${SUSQL_PROMETHEUS_URL} ]]; then if [[ -z ${SHARED_PROMETHEUS} ]]; then # using separate prometheus instance @@ -102,6 +106,7 @@ echo "PROMETHEUS_NAMESPACE - '${PROMETHEUS_NAMESPACE}'" echo "PROMETHEUS_DOMAIN - '${PROMETHEUS_DOMAIN}'" echo "PROMETHEUS_PORT - '${PROMETHEUS_PORT}'" echo "KEPLER_PROMETHEUS_URL - 
'${KEPLER_PROMETHEUS_URL}'" +echo "KEPLER_METRIC_NAME - '${KEPLER_METRIC_NAME}'" echo "SUSQL_PROMETHEUS_URL - '${SUSQL_PROMETHEUS_URL}'" echo "SUSQL_SAMPLING_RATE - '${SUSQL_SAMPLING_RATE}'" echo "SUSQL_ENHANCED - '${SUSQL_ENHANCED}'" @@ -130,6 +135,7 @@ echo "export PROMETHEUS_NAMESPACE=${PROMETHEUS_NAMESPACE}" >> ${LOGFILE} echo "export PROMETHEUS_DOMAIN=${PROMETHEUS_DOMAIN}" >> ${LOGFILE} echo "export PROMETHEUS_PORT=${PROMETHEUS_PORT}" >> ${LOGFILE} echo "export KEPLER_PROMETHEUS_URL=${KEPLER_PROMETHEUS_URL}" >> ${LOGFILE} +echo "export KEPLER_METRIC_NAME=${KEPLER_METRIC_NAME}" >> ${LOGFILE} echo "export SUSQL_PROMETHEUS_URL=${SUSQL_PROMETHEUS_URL}" >> ${LOGFILE} echo "export SUSQL_SAMPLING_RATE=${SUSQL_SAMPLING_RATE}" >> ${LOGFILE} echo "export SUSQL_ENHANCED=${SUSQL_ENHANCED}" >> ${LOGFILE} @@ -219,6 +225,7 @@ do cd - helm upgrade --install --wait susql-controller ${SUSQL_DIR}/deployment/susql-controller --namespace ${SUSQL_NAMESPACE} \ --set keplerPrometheusUrl="${KEPLER_PROMETHEUS_URL}" \ + --set keplerMetricName="${KEPLER_METRIC_NAME}" \ --set susqlPrometheusDatabaseUrl="${SUSQL_PROMETHEUS_URL}" \ --set susqlPrometheusMetricsUrl="http://0.0.0.0:8082" \ --set samplingRate="${SUSQL_SAMPLING_RATE}" \ diff --git a/deployment/susql-controller/templates/deployment.yaml b/deployment/susql-controller/templates/deployment.yaml index 7aa280a..cc059b3 100644 --- a/deployment/susql-controller/templates/deployment.yaml +++ b/deployment/susql-controller/templates/deployment.yaml @@ -25,6 +25,7 @@ spec: imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }} args: - "--kepler-prometheus-url={{ 
.Values.keplerPrometheusUrl }}" + - "--kepler-metric-name={{ .Values.keplerMetricName }}" - "--susql-prometheus-database-url={{ .Values.susqlPrometheusDatabaseUrl }}" - "--susql-prometheus-metrics-url={{ .Values.susqlPrometheusMetricsUrl }}" - "--sampling-rate={{ .Values.samplingRate }}" diff --git a/deployment/susql-controller/values.yaml b/deployment/susql-controller/values.yaml index 5275d6c..f1d7638 100644 --- a/deployment/susql-controller/values.yaml +++ b/deployment/susql-controller/values.yaml @@ -20,6 +20,7 @@ requests: # Communication with Kepler and Prometheus ##################### keplerPrometheusUrl: "http://prometheus-k8s.monitoring.svc.cluster.local:9090" +keplerMetricName: "kepler_container_joules_total" susqlPrometheusDatabaseUrl: "http://prometheus-susql.openshift-kepler-operator.svc.cluster.local:9090" susqlPrometheusMetricsUrl: "http://0.0.0.0:8082" samplingRate: "2" diff --git a/internal/controller/labelgroup_controller.go b/internal/controller/labelgroup_controller.go index 4304f7d..9f7c861 100644 --- a/internal/controller/labelgroup_controller.go +++ b/internal/controller/labelgroup_controller.go @@ -35,13 +35,13 @@ type LabelGroupReconciler struct { client.Client Scheme *runtime.Scheme KeplerPrometheusUrl string + KeplerMetricName string SusQLPrometheusDatabaseUrl string SusQLPrometheusMetricsUrl string SamplingRate time.Duration // Sampling rate for all the label groups } const ( - keplerMetricName = "kepler_container_joules_total" // Kepler metric to query susqlMetricName = "susql_total_energy_joules" // SusQL metric to query fixingDelay = 15 * time.Second // Time to wait in the even the label group was badly constructed errorDelay = 1 * time.Second // Time to wait when an error happens due to network connectivity issues @@ -175,7 +175,7 @@ func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Aggregate Kepler measurements for these set of pods - metricValues, err := 
r.GetMetricValuesForPodNames(keplerMetricName, podNames, namespaceNames) + metricValues, err := r.GetMetricValuesForPodNames(r.KeplerMetricName, podNames, namespaceNames) if err != nil { fmt.Printf("ERROR [Reconcile]: Querying Prometheus didn't work: %v\n", err)