Skip to content

Commit

Permalink
enable customization of Kepler Metric
Browse files Browse the repository at this point in the history
Signed-off-by: Scott Trent <[email protected]>
  • Loading branch information
trent-s committed Mar 15, 2024
1 parent a006409 commit 32863fd
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 3 deletions.
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,25 @@ Energy of the group of pods is exposed in 2 ways:
* Through Prometheus at `http://prometheus-susql.openshift-kepler-operator.svc.cluster.local:9090` using the query `susql_total_energy_joules{susql_label_1=my-label-1,susql_label_2=my-label-2}`
* From `status` of the `LabelGroup` CRD given as `labelgroup.status.totalEnergy`
### Installation Configuration Options
| Environment Variable | Default Value | Description |
|----------------------------|----------------------------|--------------------------------------------|
| SUSQL_NAMESPACE | openshift-kepler-operator | namespace that SUSQL resources run in |
| KEPLER_PROMETHEUS_NAMESPACE | openshift-monitoring | namespace that Kepler Prometheus runs in |
| PROMETHEUS_PROTOCOL | http | Either http or https for Kepler Prometheus access |
| PROMETHEUS_SERVICE | prometheus-k8s | service name for the Kepler Prometheus |
| PROMETHEUS_NAMESPACE | monitoring | namespace used by the Kepler Prometheus |
| PROMETHEUS_DOMAIN | svc.cluster.local | Domain used by the Kepler Prometheus |
| PROMETHEUS_PORT | 9090 | Port used by the Kepler Prometheus |
| KEPLER_PROMETHEUS_URL | http://prometheus-k8s.monitoring.svc.cluster.local:9090 | A shortcut to specify final Kepler Prometheus URL |
| KEPLER_METRIC_NAME | kepler_container_joules_total | Metric queried in the Kepler Prometheus |
| SUSQL_PROMETHEUS_URL | http://prometheus-susql.openshift-kepler-operator.svc.cluster.local:9090 | SusQL Prometheus URL |
| SUSQL_SAMPLING_RATE | 2 | Sampling rate in seconds |
| SUSQL_ENHANCED | | If set to any string, then use enhanced RBAC and SMON configuration |
| SUSQL_REGISTRY | quay.io/sustainable_computing_io | Container registry that SusQL is stored in |
| SUSQL_IMAGE_NAME | susql_operator | Image name used on SusQL container registry |
| SUSQL_IMAGE_TAG | latest | Tag for SusQL container |
5 changes: 4 additions & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ func main() {
var enableLeaderElection bool
var probeAddr string
var keplerPrometheusUrl string
var keplerMetricName string
var susqlPrometheusMetricsUrl string
var susqlPrometheusDatabaseUrl string
var samplingRate string
Expand All @@ -63,9 +64,10 @@ func main() {
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
flag.StringVar(&keplerPrometheusUrl, "kepler-prometheus-url", "", "The URL for the Prometheus server where Kepler stores the energy data")
flag.StringVar(&keplerMetricName, "kepler-metric-name", "kepler_container_joules_total", "The metric name to be queried in the kepler Prometheus server")
flag.StringVar(&susqlPrometheusDatabaseUrl, "susql-prometheus-database-url", "", "The URL for the Prometheus database where SusQL stores the energy data")
flag.StringVar(&susqlPrometheusMetricsUrl, "susql-prometheus-metrics-url", "", "The URL for the Prometheus metrics where SusQL exposes the energy data")
flag.StringVar(&samplingRate, "sampling-rate", "", "Sampling rate in seconds")
flag.StringVar(&samplingRate, "sampling-rate", "2", "Sampling rate in seconds")

opts := zap.Options{
Development: true,
Expand Down Expand Up @@ -108,6 +110,7 @@ func main() {
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
KeplerPrometheusUrl: keplerPrometheusUrl,
KeplerMetricName: keplerMetricName,
SusQLPrometheusDatabaseUrl: susqlPrometheusDatabaseUrl,
SusQLPrometheusMetricsUrl: susqlPrometheusMetricsUrl,
SamplingRate: time.Duration(samplingRateInteger) * time.Second,
Expand Down
9 changes: 9 additions & 0 deletions deployment/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ if [[ -z ${KEPLER_PROMETHEUS_URL} ]]; then
KEPLER_PROMETHEUS_URL="${PROMETHEUS_PROTOCOL}://${PROMETHEUS_SERVICE}.${PROMETHEUS_NAMESPACE}.${PROMETHEUS_DOMAIN}:${PROMETHEUS_PORT}"
fi

# Fall back to the default Kepler metric name when KEPLER_METRIC_NAME is unset or empty.
if [[ -z ${KEPLER_METRIC_NAME} ]]; then
KEPLER_METRIC_NAME="kepler_container_joules_total"
fi

if [[ -z ${SUSQL_PROMETHEUS_URL} ]]; then
if [[ -z ${SHARED_PROMETHEUS} ]]; then
# using separate prometheus instance
Expand Down Expand Up @@ -102,6 +106,7 @@ echo "PROMETHEUS_NAMESPACE - '${PROMETHEUS_NAMESPACE}'"
echo "PROMETHEUS_DOMAIN - '${PROMETHEUS_DOMAIN}'"
echo "PROMETHEUS_PORT - '${PROMETHEUS_PORT}'"
echo "KEPLER_PROMETHEUS_URL - '${KEPLER_PROMETHEUS_URL}'"
echo "KEPLER_METRIC_NAME - '${KEPLER_METRIC_NAME}'"
echo "SUSQL_PROMETHEUS_URL - '${SUSQL_PROMETHEUS_URL}'"
echo "SUSQL_SAMPLING_RATE - '${SUSQL_SAMPLING_RATE}'"
echo "SUSQL_ENHANCED - '${SUSQL_ENHANCED}'"
Expand All @@ -112,6 +117,8 @@ echo "==========================================================================
# Actions to perform, separated by comma.
# The first positional argument overrides the list; when it is absent the full
# check/undeploy/deploy sequence below is used.
actions=${1:-"kepler-check,prometheus-undeploy,prometheus-deploy,susql-undeploy,susql-deploy"}

# output deploy information
LOGFILE=.susql-deploy-info.txt
LASTLOGFILE=.susql-deploy-info-last.txt
Expand All @@ -130,6 +137,7 @@ echo "export PROMETHEUS_NAMESPACE=${PROMETHEUS_NAMESPACE}" >> ${LOGFILE}
echo "export PROMETHEUS_DOMAIN=${PROMETHEUS_DOMAIN}" >> ${LOGFILE}
echo "export PROMETHEUS_PORT=${PROMETHEUS_PORT}" >> ${LOGFILE}
echo "export KEPLER_PROMETHEUS_URL=${KEPLER_PROMETHEUS_URL}" >> ${LOGFILE}
echo "export KEPLER_METRIC_NAME=${KEPLER_METRIC_NAME}" >> ${LOGFILE}
echo "export SUSQL_PROMETHEUS_URL=${SUSQL_PROMETHEUS_URL}" >> ${LOGFILE}
echo "export SUSQL_SAMPLING_RATE=${SUSQL_SAMPLING_RATE}" >> ${LOGFILE}
echo "export SUSQL_ENHANCED=${SUSQL_ENHANCED}" >> ${LOGFILE}
Expand Down Expand Up @@ -219,6 +227,7 @@ do
cd -
helm upgrade --install --wait susql-controller ${SUSQL_DIR}/deployment/susql-controller --namespace ${SUSQL_NAMESPACE} \
--set keplerPrometheusUrl="${KEPLER_PROMETHEUS_URL}" \
--set keplerMetricName="${KEPLER_METRIC_NAME}" \
--set susqlPrometheusDatabaseUrl="${SUSQL_PROMETHEUS_URL}" \
--set susqlPrometheusMetricsUrl="http://0.0.0.0:8082" \
--set samplingRate="${SUSQL_SAMPLING_RATE}" \
Expand Down
1 change: 1 addition & 0 deletions deployment/susql-controller/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ spec:
imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
args:
- "--kepler-prometheus-url={{ .Values.keplerPrometheusUrl }}"
# The values key is case-sensitive: it must match "keplerMetricName" as set in values.yaml and by deploy.sh.
- "--kepler-metric-name={{ .Values.keplerMetricName }}"
- "--susql-prometheus-database-url={{ .Values.susqlPrometheusDatabaseUrl }}"
- "--susql-prometheus-metrics-url={{ .Values.susqlPrometheusMetricsUrl }}"
- "--sampling-rate={{ .Values.samplingRate }}"
Expand Down
1 change: 1 addition & 0 deletions deployment/susql-controller/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ requests:
# Communication with Kepler and Prometheus
#####################
keplerPrometheusUrl: "http://prometheus-k8s.monitoring.svc.cluster.local:9090"
keplerMetricName: "kepler_container_joules_total"
susqlPrometheusDatabaseUrl: "http://prometheus-susql.openshift-kepler-operator.svc.cluster.local:9090"
susqlPrometheusMetricsUrl: "http://0.0.0.0:8082"
samplingRate: "2"
4 changes: 2 additions & 2 deletions internal/controller/labelgroup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ type LabelGroupReconciler struct {
client.Client
Scheme *runtime.Scheme
KeplerPrometheusUrl string
KeplerMetricName string
SusQLPrometheusDatabaseUrl string
SusQLPrometheusMetricsUrl string
SamplingRate time.Duration // Sampling rate for all the label groups
}

const (
keplerMetricName = "kepler_container_joules_total" // Kepler metric to query
susqlMetricName = "susql_total_energy_joules" // SusQL metric to query
fixingDelay = 15 * time.Second // Time to wait in the event that the label group was badly constructed
errorDelay = 1 * time.Second // Time to wait when an error happens due to network connectivity issues
Expand Down Expand Up @@ -175,7 +175,7 @@ func (r *LabelGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}

// Aggregate Kepler measurements for these set of pods
metricValues, err := r.GetMetricValuesForPodNames(keplerMetricName, podNames, namespaceNames)
metricValues, err := r.GetMetricValuesForPodNames(r.KeplerMetricName, podNames, namespaceNames)

if err != nil {
fmt.Printf("ERROR [Reconcile]: Querying Prometheus didn't work: %v\n", err)
Expand Down

0 comments on commit 32863fd

Please sign in to comment.