From 608c7a9f0af29081dca3e4e70fa03f3299fbc3d2 Mon Sep 17 00:00:00 2001 From: Scott Trent Date: Tue, 4 Jun 2024 13:54:26 +0900 Subject: [PATCH 1/3] test tools Signed-off-by: Scott Trent --- test/clean.sh | 4 ++++ test/start.sh | 5 +++++ 2 files changed, 9 insertions(+) create mode 100644 test/clean.sh create mode 100644 test/start.sh diff --git a/test/clean.sh b/test/clean.sh new file mode 100644 index 0000000..be672f1 --- /dev/null +++ b/test/clean.sh @@ -0,0 +1,4 @@ +oc delete -f labelgroups.yaml +oc delete -f energy-consumer-job.yaml +oc delete -f training-job-1.yaml +oc delete -f training-job-2.yaml diff --git a/test/start.sh b/test/start.sh new file mode 100644 index 0000000..0e7a3c8 --- /dev/null +++ b/test/start.sh @@ -0,0 +1,5 @@ +oc apply -f labelgroups.yaml +sleep 10 +oc apply -f energy-consumer-job.yaml +oc apply -f training-job-1.yaml +oc apply -f training-job-2.yaml From 7a810423c8ef7f72d55c5f0fb1983042977216cb Mon Sep 17 00:00:00 2001 From: Scott Trent Date: Wed, 5 Jun 2024 15:40:58 +0900 Subject: [PATCH 2/3] support Kepler idle data Signed-off-by: Scott Trent --- VERSION | 2 +- .../susql-operator.clusterserviceversion.yaml | 12 +-- cmd/main.go | 2 +- config/manager/kustomization.yaml | 2 +- config/rbac/susql-rbac2.yaml | 73 +++++++++++++++++++ internal/controller/labelgroup_controller.go | 16 ++-- internal/controller/prometheus_manager.go | 14 +++- 7 files changed, 102 insertions(+), 19 deletions(-) create mode 100644 config/rbac/susql-rbac2.yaml diff --git a/VERSION b/VERSION index e3b86dd..cd23180 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.0.16 +0.0.17 diff --git a/bundle/manifests/susql-operator.clusterserviceversion.yaml b/bundle/manifests/susql-operator.clusterserviceversion.yaml index 1dc23d9..46cf415 100644 --- a/bundle/manifests/susql-operator.clusterserviceversion.yaml +++ b/bundle/manifests/susql-operator.clusterserviceversion.yaml @@ -22,14 +22,14 @@ metadata: ] capabilities: Basic Install categories: Monitoring - containerImage: quay.io/sustainable_computing_io/susql_operator:0.0.16 - createdAt: "2024-05-30T08:31:43Z" + containerImage: quay.io/sustainable_computing_io/susql_operator:0.0.17 + createdAt: "2024-06-05T06:38:56Z" description: 'Aggregates energy data from pods tagged with SusQL labels ' operators.operatorframework.io/builder: operator-sdk-v1.34.1 operators.operatorframework.io/project_layout: go.kubebuilder.io/v4 repository: https://github.com/sustainable-computing-io/susql-operator support: https://github.com/sustainable-computing-io/susql-operator/issues - name: susql-operator.v0.0.16 + name: susql-operator.v0.0.17 namespace: placeholder spec: apiservicedefinitions: {} @@ -50,7 +50,7 @@ spec: ### Prerequisites - 1. Deployment of Kepler on the cluster + 1. Deployment of [Kepler](https://sustainable-computing.io/) on the cluster 2. Ensure that [User Project Monitoring](https://docs.openshift.com/container-platform/latest/monitoring/enabling-monitoring-for-user-defined-projects.html) is enabled to monitor energy consumed in user projects. @@ -216,7 +216,7 @@ spec: value: http://0.0.0.0:8082 - name: SAMPLING-RATE value: "2" - image: quay.io/sustainable_computing_io/susql_operator:0.0.16 + image: quay.io/sustainable_computing_io/susql_operator:0.0.17 imagePullPolicy: IfNotPresent livenessProbe: httpGet: @@ -320,4 +320,4 @@ spec: provider: name: SusQL Operator Contributors url: https://github.com/sustainable-computing-io/susql-operator - version: 0.0.16 + version: 0.0.17 diff --git a/cmd/main.go b/cmd/main.go index 687ad1e..ec07534 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -82,7 +82,7 @@ func main() { opts := zap.Options{ Development: true, - Level: zapcore.Level(-2), + Level: zapcore.Level(-5), } opts.BindFlags(flag.CommandLine) flag.Parse() diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 5c0ede3..0070310 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -5,4 +5,4 @@ kind: Kustomization images: - name: controller newName: quay.io/sustainable_computing_io/susql_operator - newTag: 0.0.16 + newTag: 0.0.17 diff --git a/config/rbac/susql-rbac2.yaml b/config/rbac/susql-rbac2.yaml new file mode 100644 index 0000000..5f1d49c --- /dev/null +++ b/config/rbac/susql-rbac2.yaml @@ -0,0 +1,73 @@ +# bind susql service account to prometheus clusterrole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + name: susql-operatorprometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: susql-controller-manager + namespace: openshift-operators +--- +# allow susql service account to access metrics on openshift-kepler-operator (might not need due to the same namespace) +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: prometheus-k8s + namespace: openshift-operators + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus +rules: + - verbs: + - get + - list + - watch + apiGroups: + - '' + resources: + - services + - endpoints + - pods + - verbs: + - get + - list + - watch + apiGroups: + - extensions + resources: + - ingresses + - verbs: + - get + - list + - watch + apiGroups: + - networking.k8s.io + resources: + - ingresses +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: susql-prometheus-k8s + namespace: openshift-operators + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus +subjects: + - kind: ServiceAccount + name: susql-controller-manager + namespace: openshift-operators +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s diff --git a/internal/controller/labelgroup_controller.go b/internal/controller/labelgroup_controller.go index de582e6..d6ac6f3 100644 --- a/internal/controller/labelgroup_controller.go +++ b/internal/controller/labelgroup_controller.go @@ -70,6 +70,8 @@ var ( func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { _ = log.FromContext(ctx) + r.Logger.V(5).Info("[Reconcile] Entered Reconcile().") + // Get label group object to process if it exists labelGroup := &susqlv1.LabelGroup{} @@ -96,7 +98,7 @@ func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Decide what action to take based on the state of the labelGroup switch labelGroup.Status.Phase { case susqlv1.Initializing: - r.Logger.V(2).Info("[Reconcile] Entered initializing case.") + r.Logger.V(5).Info("[Reconcile] Entered initializing case.") if len(labelGroup.Spec.Labels) > len(susqlPrometheusLabelNames) { r.Logger.V(0).Error(fmt.Errorf("[Reconcile] The number of provided labels is greater than the maximum number of supported labels (e.g., up to %d labels).", len(susqlPrometheusLabelNames)), "") return ctrl.Result{RequeueAfter: fixingDelay}, nil @@ -147,7 +149,7 @@ func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil case susqlv1.Reloading: - r.Logger.V(2).Info("[Reconcile] Entered reloading case.") + r.Logger.V(5).Info("[Reconcile] Entered reloading case.") // Reload data from existing database if !labelGroup.Spec.DisableUsingMostRecentValue { totalEnergy, err := r.GetMostRecentValue(labelGroup.Status.SusQLPrometheusQuery) @@ -171,11 +173,13 @@ func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil case susqlv1.Aggregating: - // r.Logger.V(2).Info("[Reconcile] Entered aggregating case.") // trace + r.Logger.V(5).Info("[Reconcile] Entered aggregating case.") // trace // Get list of pods matching the label group podNames, namespaceNames, err := r.GetPodNamesMatchingLabels(ctx, labelGroup) + r.Logger.V(5).Info(fmt.Sprintf("[Reconcile-Aggregating] podNames: %s", podNames)) // trace + r.Logger.V(5).Info(fmt.Sprintf("[Reconcile-Aggregating] namespaceNames: %s", namespaceNames)) // trace - if err != nil { + if err != nil || len(podNames) == 0 || len(namespaceNames) == 0 { r.Logger.V(0).Error(err, "[Reconcile] Couldn't get pods for the labels provided.") return ctrl.Result{}, err } @@ -236,7 +240,7 @@ func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{RequeueAfter: r.SamplingRate}, nil default: - r.Logger.V(2).Info("[Reconcile] Entered default case.") + r.Logger.V(5).Info("[Reconcile] Entered default case.") // First time seeing this object labelGroup.Status.Phase = susqlv1.Initializing @@ -254,7 +258,7 @@ func (r *LabelGroupReconciler) SetupWithManager(mgr ctrl.Manager) error { For(&susqlv1.LabelGroup{}). Complete(r) - r.Logger.V(2).Info("[SetupWithManager] Initializing Metrics Exporter.") + r.Logger.V(5).Info("[SetupWithManager] Initializing Metrics Exporter.") // Start server to export metrics r.InitializeMetricsExporter() diff --git a/internal/controller/prometheus_manager.go b/internal/controller/prometheus_manager.go index 9911e66..55ae59a 100644 --- a/internal/controller/prometheus_manager.go +++ b/internal/controller/prometheus_manager.go @@ -66,8 +66,8 @@ func (r *LabelGroupReconciler) GetMostRecentValue(susqlPrometheusQuery string) ( queryString := fmt.Sprintf("max_over_time(%s[%s])", susqlPrometheusQuery, maxQueryTime) results, warnings, err := v1api.Query(ctx, queryString, time.Now(), v1.WithTimeout(0*time.Second)) - r.Logger.V(2).Info(fmt.Sprintf("[GetMostRecentValue] Query: %s", queryString)) // trace - r.Logger.V(2).Info(fmt.Sprintf("[GetMostRecentValue] Results: '%v'", results)) // trace + r.Logger.V(5).Info(fmt.Sprintf("[GetMostRecentValue] Query: %s", queryString)) // trace + r.Logger.V(5).Info(fmt.Sprintf("[GetMostRecentValue] Results: '%v'", results)) // trace if len(warnings) > 0 { r.Logger.V(0).Info(fmt.Sprintf("WARNING [GetMostRecentValue] %v\n", warnings) + @@ -115,12 +115,17 @@ func (r *LabelGroupReconciler) GetMetricValuesForPodNames(metricName string, pod // new query for issue 2: can improve runtime efficiency... queryString := fmt.Sprintf("sum(%s{pod_name=\"%s\",container_namespace=\"%s\",mode=\"dynamic\"})", metricName, podNames[0], namespaceNames[0]) + queryString = queryString + "+" + fmt.Sprintf("sum(%s{pod_name=\"%s\",container_namespace=\"%s\",mode=\"idle\"})", metricName, podNames[0], namespaceNames[0]) for i := 1; i < len(podNames); i++ { queryString = queryString + "+" + fmt.Sprintf("sum(%s{pod_name=\"%s\",container_namespace=\"%s\",mode=\"dynamic\"})", metricName, podNames[i], namespaceNames[i]) + queryString = queryString + "+" + fmt.Sprintf("sum(%s{pod_name=\"%s\",container_namespace=\"%s\",mode=\"idle\"})", metricName, podNames[i], namespaceNames[i]) } results, warnings, err := v1api.Query(ctx, queryString, time.Now(), v1.WithTimeout(0*time.Second)) + r.Logger.V(5).Info(fmt.Sprintf("[GetMetricValuesForPodNames] Query: %s", queryString)) // trace + r.Logger.V(5).Info(fmt.Sprintf("[GetMetricValuesForPodNames] Results: '%v'", results)) // trace + if err != nil { r.Logger.V(0).Error(err, "[GetMetricValuesForPodNames] Querying Prometheus didn't work.\n"+ fmt.Sprintf("\tmetricName: %s\n", metricName)+ @@ -139,6 +144,7 @@ func (r *LabelGroupReconciler) GetMetricValuesForPodNames(metricName string, pod metricValues := make(map[string]float64, len(results.(model.Vector))) for _, result := range results.(model.Vector) { + r.Logger.V(5).Info(fmt.Sprintf("[GetMetricValuesForPodNames] Container id %s value is %f.", string(result.Metric["container_id"]), float64(result.Value))) // trace metricValues[string(result.Metric["container_id"])] = float64(result.Value) } @@ -164,7 +170,7 @@ var ( func (r *LabelGroupReconciler) InitializeMetricsExporter() { // Initiate the exporting of prometheus metrics for the energy - r.Logger.V(2).Info("Entering InitializeMetricsExporter().") + r.Logger.V(5).Info("Entering InitializeMetricsExporter().") if prometheusRegistry == nil { prometheusRegistry = prometheus.NewRegistry() prometheusRegistry.MustRegister(susqlMetrics.totalEnergy) @@ -194,7 +200,7 @@ func (r *LabelGroupReconciler) SetAggregatedEnergyForLabels(totalEnergy float64, // Save aggregated energy to Prometheus table susqlMetrics.totalEnergy.With(prometheusLabels).Set(totalEnergy) - r.Logger.V(2).Info(fmt.Sprintf("[SetAggregatedEnergyForLabels] Setting energy %f for %v.", totalEnergy, prometheusLabels)) // trace + r.Logger.V(5).Info(fmt.Sprintf("[SetAggregatedEnergyForLabels] Setting energy %f for %v.", totalEnergy, prometheusLabels)) // trace return nil } From fcf0cbe132ffb022671d58e6f7bec717bf90b3eb Mon Sep 17 00:00:00 2001 From: Scott Trent Date: Wed, 5 Jun 2024 15:47:54 +0900 Subject: [PATCH 3/3] remove experimental file Signed-off-by: Scott Trent --- config/rbac/susql-rbac2.yaml | 73 ------------------------------------ 1 file changed, 73 deletions(-) delete mode 100644 config/rbac/susql-rbac2.yaml diff --git a/config/rbac/susql-rbac2.yaml b/config/rbac/susql-rbac2.yaml deleted file mode 100644 index 5f1d49c..0000000 --- a/config/rbac/susql-rbac2.yaml +++ /dev/null @@ -1,73 +0,0 @@ -# bind susql service account to prometheus clusterrole -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - name: susql-operatorprometheus-k8s -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: susql-controller-manager - namespace: openshift-operators ---- -# allow susql service account to access metrics on openshift-kepler-operator (might not need due to the same namespace) -kind: Role -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: prometheus-k8s - namespace: openshift-operators - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus -rules: - - verbs: - - get - - list - - watch - apiGroups: - - '' - resources: - - services - - endpoints - - pods - - verbs: - - get - - list - - watch - apiGroups: - - extensions - resources: - - ingresses - - verbs: - - get - - list - - watch - apiGroups: - - networking.k8s.io - resources: - - ingresses ---- -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: susql-prometheus-k8s - namespace: openshift-operators - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus -subjects: - - kind: ServiceAccount - name: susql-controller-manager - namespace: openshift-operators -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s