Skip to content

Commit

Permalink
Merge pull request #105 from sustainable-computing-io/idle
Browse files Browse the repository at this point in the history
Kepler Idle mode support
  • Loading branch information
trent-s authored Jun 5, 2024
2 parents 9cda53c + fcf0cbe commit ea1dac6
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 19 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.16
0.0.17
12 changes: 6 additions & 6 deletions bundle/manifests/susql-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ metadata:
]
capabilities: Basic Install
categories: Monitoring
containerImage: quay.io/sustainable_computing_io/susql_operator:0.0.16
createdAt: "2024-05-30T08:31:43Z"
containerImage: quay.io/sustainable_computing_io/susql_operator:0.0.17
createdAt: "2024-06-05T06:38:56Z"
description: 'Aggregates energy data from pods tagged with SusQL labels '
operators.operatorframework.io/builder: operator-sdk-v1.34.1
operators.operatorframework.io/project_layout: go.kubebuilder.io/v4
repository: https://github.com/sustainable-computing-io/susql-operator
support: https://github.com/sustainable-computing-io/susql-operator/issues
name: susql-operator.v0.0.16
name: susql-operator.v0.0.17
namespace: placeholder
spec:
apiservicedefinitions: {}
Expand All @@ -50,7 +50,7 @@ spec:
### Prerequisites
1. Deployment of Kepler on the cluster
1. Deployment of [Kepler](https://sustainable-computing.io/) on the cluster
2. Ensure that [User Project Monitoring](https://docs.openshift.com/container-platform/latest/monitoring/enabling-monitoring-for-user-defined-projects.html)
is enabled to monitor energy consumed in user projects.
Expand Down Expand Up @@ -216,7 +216,7 @@ spec:
value: http://0.0.0.0:8082
- name: SAMPLING-RATE
value: "2"
image: quay.io/sustainable_computing_io/susql_operator:0.0.16
image: quay.io/sustainable_computing_io/susql_operator:0.0.17
imagePullPolicy: IfNotPresent
livenessProbe:
httpGet:
Expand Down Expand Up @@ -320,4 +320,4 @@ spec:
provider:
name: SusQL Operator Contributors
url: https://github.com/sustainable-computing-io/susql-operator
version: 0.0.16
version: 0.0.17
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func main() {

opts := zap.Options{
Development: true,
Level: zapcore.Level(-2),
Level: zapcore.Level(-5),
}
opts.BindFlags(flag.CommandLine)
flag.Parse()
Expand Down
2 changes: 1 addition & 1 deletion config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ kind: Kustomization
images:
- name: controller
newName: quay.io/sustainable_computing_io/susql_operator
newTag: 0.0.16
newTag: 0.0.17
16 changes: 10 additions & 6 deletions internal/controller/labelgroup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ var (
func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
_ = log.FromContext(ctx)

r.Logger.V(5).Info("[Reconcile] Entered Reconcile().")

// Get label group object to process if it exists
labelGroup := &susqlv1.LabelGroup{}

Expand All @@ -96,7 +98,7 @@ func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
// Decide what action to take based on the state of the labelGroup
switch labelGroup.Status.Phase {
case susqlv1.Initializing:
r.Logger.V(2).Info("[Reconcile] Entered initializing case.")
r.Logger.V(5).Info("[Reconcile] Entered initializing case.")
if len(labelGroup.Spec.Labels) > len(susqlPrometheusLabelNames) {
r.Logger.V(0).Error(fmt.Errorf("[Reconcile] The number of provided labels is greater than the maximum number of supported labels (e.g., up to %d labels).", len(susqlPrometheusLabelNames)), "")
return ctrl.Result{RequeueAfter: fixingDelay}, nil
Expand Down Expand Up @@ -147,7 +149,7 @@ func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{}, nil

case susqlv1.Reloading:
r.Logger.V(2).Info("[Reconcile] Entered reloading case.")
r.Logger.V(5).Info("[Reconcile] Entered reloading case.")
// Reload data from existing database
if !labelGroup.Spec.DisableUsingMostRecentValue {
totalEnergy, err := r.GetMostRecentValue(labelGroup.Status.SusQLPrometheusQuery)
Expand All @@ -171,11 +173,13 @@ func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{}, nil

case susqlv1.Aggregating:
// r.Logger.V(2).Info("[Reconcile] Entered aggregating case.") // trace
r.Logger.V(5).Info("[Reconcile] Entered aggregating case.") // trace
// Get list of pods matching the label group
podNames, namespaceNames, err := r.GetPodNamesMatchingLabels(ctx, labelGroup)
r.Logger.V(5).Info(fmt.Sprintf("[Reconcile-Aggregating] podNames: %s", podNames)) // trace
r.Logger.V(5).Info(fmt.Sprintf("[Reconcile-Aggregating] namespaceNames: %s", namespaceNames)) // trace

if err != nil {
if err != nil || len(podNames) == 0 || len(namespaceNames) == 0 {
r.Logger.V(0).Error(err, "[Reconcile] Couldn't get pods for the labels provided.")
return ctrl.Result{}, err
}
Expand Down Expand Up @@ -236,7 +240,7 @@ func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{RequeueAfter: r.SamplingRate}, nil

default:
r.Logger.V(2).Info("[Reconcile] Entered default case.")
r.Logger.V(5).Info("[Reconcile] Entered default case.")
// First time seeing this object
labelGroup.Status.Phase = susqlv1.Initializing

Expand All @@ -254,7 +258,7 @@ func (r *LabelGroupReconciler) SetupWithManager(mgr ctrl.Manager) error {
For(&susqlv1.LabelGroup{}).
Complete(r)

r.Logger.V(2).Info("[SetupWithManager] Initializing Metrics Exporter.")
r.Logger.V(5).Info("[SetupWithManager] Initializing Metrics Exporter.")

// Start server to export metrics
r.InitializeMetricsExporter()
Expand Down
14 changes: 10 additions & 4 deletions internal/controller/prometheus_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ func (r *LabelGroupReconciler) GetMostRecentValue(susqlPrometheusQuery string) (
queryString := fmt.Sprintf("max_over_time(%s[%s])", susqlPrometheusQuery, maxQueryTime)
results, warnings, err := v1api.Query(ctx, queryString, time.Now(), v1.WithTimeout(0*time.Second))

r.Logger.V(2).Info(fmt.Sprintf("[GetMostRecentValue] Query: %s", queryString)) // trace
r.Logger.V(2).Info(fmt.Sprintf("[GetMostRecentValue] Results: '%v'", results)) // trace
r.Logger.V(5).Info(fmt.Sprintf("[GetMostRecentValue] Query: %s", queryString)) // trace
r.Logger.V(5).Info(fmt.Sprintf("[GetMostRecentValue] Results: '%v'", results)) // trace

if len(warnings) > 0 {
r.Logger.V(0).Info(fmt.Sprintf("WARNING [GetMostRecentValue] %v\n", warnings) +
Expand Down Expand Up @@ -115,12 +115,17 @@ func (r *LabelGroupReconciler) GetMetricValuesForPodNames(metricName string, pod

// new query for issue 2: can improve runtime efficiency...
queryString := fmt.Sprintf("sum(%s{pod_name=\"%s\",container_namespace=\"%s\",mode=\"dynamic\"})", metricName, podNames[0], namespaceNames[0])
queryString = queryString + "+" + fmt.Sprintf("sum(%s{pod_name=\"%s\",container_namespace=\"%s\",mode=\"idle\"})", metricName, podNames[0], namespaceNames[0])
for i := 1; i < len(podNames); i++ {
queryString = queryString + "+" + fmt.Sprintf("sum(%s{pod_name=\"%s\",container_namespace=\"%s\",mode=\"dynamic\"})", metricName, podNames[i], namespaceNames[i])
queryString = queryString + "+" + fmt.Sprintf("sum(%s{pod_name=\"%s\",container_namespace=\"%s\",mode=\"idle\"})", metricName, podNames[i], namespaceNames[i])
}

results, warnings, err := v1api.Query(ctx, queryString, time.Now(), v1.WithTimeout(0*time.Second))

r.Logger.V(5).Info(fmt.Sprintf("[GetMetricValuesForPodNames] Query: %s", queryString)) // trace
r.Logger.V(5).Info(fmt.Sprintf("[GetMetricValuesForPodNames] Results: '%v'", results)) // trace

if err != nil {
r.Logger.V(0).Error(err, "[GetMetricValuesForPodNames] Querying Prometheus didn't work.\n"+
fmt.Sprintf("\tmetricName: %s\n", metricName)+
Expand All @@ -139,6 +144,7 @@ func (r *LabelGroupReconciler) GetMetricValuesForPodNames(metricName string, pod
metricValues := make(map[string]float64, len(results.(model.Vector)))

for _, result := range results.(model.Vector) {
r.Logger.V(5).Info(fmt.Sprintf("[GetMetricValuesForPodNames] Container id %s value is %f.", string(result.Metric["container_id"]), float64(result.Value))) // trace
metricValues[string(result.Metric["container_id"])] = float64(result.Value)
}

Expand All @@ -164,7 +170,7 @@ var (

func (r *LabelGroupReconciler) InitializeMetricsExporter() {
// Initiate the exporting of prometheus metrics for the energy
r.Logger.V(2).Info("Entering InitializeMetricsExporter().")
r.Logger.V(5).Info("Entering InitializeMetricsExporter().")
if prometheusRegistry == nil {
prometheusRegistry = prometheus.NewRegistry()
prometheusRegistry.MustRegister(susqlMetrics.totalEnergy)
Expand Down Expand Up @@ -194,7 +200,7 @@ func (r *LabelGroupReconciler) SetAggregatedEnergyForLabels(totalEnergy float64,
// Save aggregated energy to Prometheus table
susqlMetrics.totalEnergy.With(prometheusLabels).Set(totalEnergy)

r.Logger.V(2).Info(fmt.Sprintf("[SetAggregatedEnergyForLabels] Setting energy %f for %v.", totalEnergy, prometheusLabels)) // trace
r.Logger.V(5).Info(fmt.Sprintf("[SetAggregatedEnergyForLabels] Setting energy %f for %v.", totalEnergy, prometheusLabels)) // trace

return nil
}
4 changes: 4 additions & 0 deletions test/clean.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Remove the test resources: the label groups manifest first, then the
# three job manifests. Paths are relative, so run this from the directory
# that contains the YAML files.
for manifest in labelgroups.yaml energy-consumer-job.yaml training-job-1.yaml training-job-2.yaml; do
    oc delete -f "$manifest"
done
5 changes: 5 additions & 0 deletions test/start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Create the label groups before any workload is submitted.
oc apply -f labelgroups.yaml

# Pause before submitting the jobs.
# NOTE(review): presumably this gives the operator time to pick up the new
# label groups before the pods start; confirm the 10 s figure is sufficient.
sleep 10

# Submit the three workload jobs in order.
for job in energy-consumer-job.yaml training-job-1.yaml training-job-2.yaml; do
    oc apply -f "$job"
done

0 comments on commit ea1dac6

Please sign in to comment.