diff --git a/.gitattributes b/.gitattributes index b150ffa1..d2142e58 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ *.json linguist-detectable *.yml linguist-detectable -*.yaml linguist-detectable \ No newline at end of file +*.yaml linguist-detectable +*.tpl linguist-language=Go \ No newline at end of file diff --git a/.github/workflows/ci-github-cms.yaml b/.github/workflows/ci-github-cms.yaml index a86afb1e..84ecd64a 100644 --- a/.github/workflows/ci-github-cms.yaml +++ b/.github/workflows/ci-github-cms.yaml @@ -51,6 +51,9 @@ jobs: - name: Deploy Helm chart run: | + helm repo add grafana https://grafana.github.io/helm-charts + helm repo update + helm dependency build ./helm/supersonic helm upgrade --install supersonic ./helm/supersonic \ --values values/values-cms-ci.yaml -n cms @@ -64,12 +67,12 @@ jobs: - name: Prometheus ready run: | - kubectl wait --for condition=Ready pod -l app.kubernetes.io/component=prometheus --timeout 120s -n cms - kubectl get svc,pod -l app.kubernetes.io/component=prometheus -n cms + kubectl wait --for condition=Ready pod -l app.kubernetes.io/name=prometheus --timeout 120s -n cms + kubectl get svc,pod -l app.kubernetes.io/name=prometheus -n cms - name: Grafana ready run: | - kubectl wait --for condition=Ready pod -l app.kubernetes.io/component=grafana --timeout 120s -n cms + kubectl wait --for condition=Ready pod -l app.kubernetes.io/name=grafana --timeout 120s -n cms - name: Triton server ready run: | diff --git a/.github/workflows/ci-local.sh b/.github/workflows/ci-local.sh new file mode 100644 index 00000000..0fa83ce3 --- /dev/null +++ b/.github/workflows/ci-local.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +echo "Starting deployment process..." + +# 1. Create a Kubernetes cluster with Kind +echo "Creating Kind cluster..." +kind create cluster --name gh-k8s-cluster + +# 2. (Assuming Helm is installed and at the proper version) + +# 3. Create CMS namespace +echo "Creating CMS namespace..." 
+kubectl create namespace cms + +# 4. Install Prometheus Operator CRDs +echo "Installing Prometheus Operator CRDs..." +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +kubectl create namespace monitoring +helm install prometheus-operator prometheus-community/kube-prometheus-stack \ + --namespace monitoring \ + --set prometheusOperator.createCustomResource=false \ + --set defaultRules.create=false \ + --set alertmanager.enabled=false \ + --set prometheus.enabled=false \ + --set grafana.enabled=false + +# 5. Install KEDA Autoscaler +echo "Installing KEDA Autoscaler..." +helm repo add kedacore https://kedacore.github.io/charts +helm repo update +kubectl create namespace keda +helm install keda kedacore/keda --namespace keda + +# 6. Mount CVMFS +echo "Mounting CVMFS..." +kubectl create namespace cvmfs-csi +helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi \ + --values ci/values-cvmfs-csi.yaml +kubectl apply -f ci/cvmfs-storageclass.yaml -n cvmfs-csi + +# 7. Deploy the Helm chart for supersonic +echo "Deploying Helm chart for supersonic..." +helm repo add grafana https://grafana.github.io/helm-charts +helm repo update +helm dependency build ./helm/supersonic +helm upgrade --install supersonic ./helm/supersonic --values values/values-cms-ci.yaml -n cms + +# 8. Wait for components to become ready + +echo "Waiting for CVMFS pods to be ready..." +kubectl wait --for=condition=Ready pod --all -n cvmfs-csi --timeout 120s + +echo "Waiting for Envoy proxy pods to be ready..." +kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=envoy --timeout 120s -n cms + +echo "Waiting for Prometheus pods to be ready..." +kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus --timeout 120s -n cms +kubectl get svc,pod -l app.kubernetes.io/name=prometheus -n cms + +echo "Waiting for Grafana pods to be ready..." 
+kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=grafana --timeout 120s -n cms + +echo "Waiting for Triton server pods to be ready..." +kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=triton --timeout 300s -n cms + +echo "Waiting for KEDA Autoscaler to be ready..." +kubectl wait --for=condition=AbleToScale hpa -l app.kubernetes.io/component=keda --timeout 120s -n cms +kubectl wait --for=condition=Ready so -l app.kubernetes.io/component=keda --timeout 120s -n cms + +# 9. Validate the Deployment +echo "Validating Deployment in 'cms' namespace..." +kubectl get all -n cms + +# 10. Run Perf Analyzer Job +echo "Running Perf Analyzer Job..." +kubectl apply -f ci/perf-analyzer-job.yaml +kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=180s || { + echo "Perf-analyzer job did not complete in time or failed." + exit 1 +} + +# Retrieve and print the logs from the Perf Analyzer pod +POD_NAME=$(kubectl get pods -n cms -l job-name=perf-analyzer-job -o jsonpath="{.items[0].metadata.name}") +echo "========== Perf Analyzer Logs ==========" +kubectl logs -n cms "$POD_NAME" +echo "========================================" + +# 11. Cleanup the Kind cluster +echo "Cleaning up: Deleting Kind cluster..." +kind delete cluster --name gh-k8s-cluster + +echo "Deployment process completed successfully!" 
\ No newline at end of file diff --git a/.github/workflows/helm-lint.yaml b/.github/workflows/helm-lint.yaml index cc5285ae..12e0bf25 100644 --- a/.github/workflows/helm-lint.yaml +++ b/.github/workflows/helm-lint.yaml @@ -55,6 +55,10 @@ jobs: - name: Lint values.yaml files in values/ directory run: | + helm repo add grafana https://grafana.github.io/helm-charts + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo update + helm dependency build ./helm/supersonic CHART_PATH="helm/supersonic/" VALUES_DIR="values/" diff --git a/.gitignore b/.gitignore index 749259dd..5e5a8582 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ # Sphinx Documentation -docs/_build \ No newline at end of file +docs/_build + +*.tgz \ No newline at end of file diff --git a/README.md b/README.md index 5164ad40..4e38dbe3 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ The main components of SuperSONIC are: ``` helm repo add fastml https://fastmachinelearning.org/SuperSONIC +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo add grafana https://grafana.github.io/helm-charts helm repo update helm install fastml/supersonic --values -n ``` diff --git a/docs/.values-table.md b/docs/.values-table.md index c289d8f5..650339b4 100644 --- a/docs/.values-table.md +++ b/docs/.values-table.md @@ -3,6 +3,8 @@ | Key | Type | Default | Description | |-----|------|---------|-------------| | nameOverride | string | `""` | Unique identifier of SuperSONIC instance (equal to release name by default) | +| serverLoadMetric | string | `""` | A metric used by both KEDA autoscaler and Envoy's prometheus-based rate limiter. 
# Default metric (inference queue latency) is defined in templates/_helpers.tpl | +| serverLoadThreshold | int | `100` | Threshold for the metric | | triton.replicas | int | `1` | Number of Triton server instances (if autoscaling is disabled) | | triton.image | string | `"nvcr.io/nvidia/tritonserver:24.12-py3-min"` | Docker image for the Triton server | | triton.command | list | `["/bin/sh","-c"]` | Command and arguments to run in Triton container | @@ -22,6 +24,7 @@ | envoy.resources | object | `{"limits":{"cpu":2,"memory":"4G"},"requests":{"cpu":1,"memory":"2G"}}` | Resource requests and limits for Envoy Proxy. Note: an Envoy Proxy with too many connections might run out of CPU | | envoy.service.type | string | `"ClusterIP"` | This is the client-facing endpoint. In order to be able to connect to it, either enable ingress, or use type: LoadBalancer. | | envoy.service.ports | list | `[{"name":"grpc","port":8001,"targetPort":8001},{"name":"admin","port":9901,"targetPort":9901}]` | Envoy Service ports | +| envoy.ingress | object | `{"annotations":{},"enabled":false,"hostName":"","ingressClassName":""}` | Ingress configuration for Envoy | | envoy.grpc_route_timeout | string | `"0s"` | Timeout for gRPC route in Envoy; disabled by default (0s), preventing Envoy from closing connections too early. | | envoy.rate_limiter.listener_level | object | `{"enabled":false,"fill_interval":"12s","max_tokens":5,"tokens_per_fill":1}` | This rate limiter explicitly controls the number of client connections to the Envoy Proxy. 
| | envoy.rate_limiter.listener_level.enabled | bool | `false` | Enable rate limiter | @@ -47,22 +50,25 @@ | autoscaler.scaleDown.window | int | `600` | | | autoscaler.scaleDown.period | int | `120` | | | autoscaler.scaleDown.stepsize | int | `1` | | -| prometheus | object | `{"external":true,"ingress":{"annotations":{},"enabled":false,"hostName":"","ingressClassName":""},"port":443,"scheme":"https","serverLoadMetric":"","serverLoadThreshold":100,"url":""}` | Connection to a Prometheus server is required for KEDA autoscaler and Envoy's prometheus-based rate limiter | -| prometheus.external | bool | `true` | Whether to use external Prometheus instance (true) or deploy internal one (false) | -| prometheus.url | string | `""` | External Prometheus server url and port number (find in documentation of a given cluster or ask admins) Only used when external=true | -| prometheus.scheme | string | `"https"` | Specify whether external Prometheus endpoint is exposed as http or https Only used when external=true | -| prometheus.serverLoadMetric | string | `""` | A metric used by both KEDA autoscaler and Envoy's prometheus-based rate limiter. 
# Default metric (inference queue latency) is defined in templates/_helpers.tpl | -| prometheus.serverLoadThreshold | int | `100` | Threshold for the metric | -| prometheus.ingress | object | `{"annotations":{},"enabled":false,"hostName":"","ingressClassName":""}` | Ingress configuration for internal Prometheus web UI (only used when external=false) | -| ingress.enabled | bool | `false` | | -| ingress.hostName | string | `""` | | -| ingress.ingressClassName | string | `""` | | -| ingress.annotations | object | `{}` | | | nodeSelector | object | `{}` | Node selector for all pods (Triton and Envoy) | | tolerations | list | `[]` | Tolerations for all pods (Triton and Envoy) | -| grafana.enabled | bool | `false` | Enable or disable Grafana deployment | -| grafana.ingress | object | `{"annotations":{},"enabled":false,"hostName":"","ingressClassName":"haproxy"}` | Ingress configuration for Grafana | -| grafana.ingress.enabled | bool | `false` | Enable or disable ingress for Grafana | -| grafana.ingress.hostName | string | `""` | Hostname for Grafana ingress | -| grafana.ingress.ingressClassName | string | `"haproxy"` | Ingress class name (e.g. 
nginx, haproxy) | -| grafana.ingress.annotations | object | `{}` | Additional annotations for Grafana ingress | \ No newline at end of file +| prometheus | object | `{"alertmanager":{"enabled":false},"configmapReload":{"prometheus":{"enabled":false}},"enabled":false,"external":{"enabled":false,"port":443,"scheme":"https","url":""},"kube-state-metrics":{"enabled":false},"prometheus-node-exporter":{"enabled":false},"prometheus-pushgateway":{"enabled":false},"pushgateway":{"enabled":false},"rbac":{"create":false},"server":{"configMapOverrideName":"prometheus-config","global":{"evaluation_interval":"5s","scrape_interval":"5s"},"ingress":{"annotations":{},"enabled":false,"hosts":[],"ingressClassName":"","tls":[{"hosts":[]}]},"persistentVolume":{"enabled":false},"releaseNamespace":true,"resources":{"limits":{"cpu":1,"memory":"1Gi"},"requests":{"cpu":"500m","memory":"512Mi"}},"retention":"15d","service":{"enabled":true,"servicePort":9090},"useExistingClusterRoleName":"supersonic-prometheus-role"},"serviceAccounts":{"server":{"create":false,"name":"supersonic-prometheus-sa"}}}` | Connection to a Prometheus server is required for KEDA autoscaler and Envoy's prometheus-based rate limiter | +| prometheus.external.enabled | bool | `false` | Enable external Prometheus instance | +| prometheus.external.url | string | `""` | External Prometheus server url | +| prometheus.external.port | int | `443` | External Prometheus server port number | +| prometheus.external.scheme | string | `"https"` | Specify whether external Prometheus endpoint is exposed as http or https | +| prometheus.enabled | bool | `false` | Enable or disable Prometheus subchart deployment | +| prometheus.server | object | 
`{"configMapOverrideName":"prometheus-config","global":{"evaluation_interval":"5s","scrape_interval":"5s"},"ingress":{"annotations":{},"enabled":false,"hosts":[],"ingressClassName":"","tls":[{"hosts":[]}]},"persistentVolume":{"enabled":false},"releaseNamespace":true,"resources":{"limits":{"cpu":1,"memory":"1Gi"},"requests":{"cpu":"500m","memory":"512Mi"}},"retention":"15d","service":{"enabled":true,"servicePort":9090},"useExistingClusterRoleName":"supersonic-prometheus-role"}` | Prometheus Helm chart configuration (https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus) | +| grafana.enabled | bool | `false` | | +| grafana.adminUser | string | `"admin"` | | +| grafana.adminPassword | string | `"admin"` | | +| grafana.persistence.enabled | bool | `false` | | +| grafana.rbac.create | bool | `false` | | +| grafana.serviceAccount.create | bool | `false` | | +| grafana.datasources | object | `{"datasources.yaml":{"apiVersion":1,"datasources":[{"access":"proxy","isDefault":true,"jsonData":{"timeInterval":"5s","tlsSkipVerify":true},"name":"prometheus","type":"prometheus","url":"http://supersonic-prometheus-server:9090"}]}}` | Grafana datasources configuration | +| grafana.dashboardProviders | object | `{"dashboardproviders.yaml":{"apiVersion":1,"providers":[{"disableDeletion":false,"editable":true,"folder":"","name":"default","options":{"path":"/var/lib/grafana/dashboards/default"},"orgId":1,"type":"file"}]}}` | Grafana dashboard providers configuration | +| grafana.dashboardsConfigMaps | object | `{"default":"supersonic-grafana-default-dashboard"}` | Grafana dashboard ConfigMaps | +| grafana."grafana.ini" | object | `{"auth":{"disable_login_form":true},"auth.anonymous":{"enabled":true,"org_role":"Admin"},"dashboards":{"default_home_dashboard_path":"/var/lib/grafana/dashboards/default/default.json"}}` | Grafana.ini configuration | +| grafana.resources | object | `{"limits":{"cpu":1,"memory":"1Gi"},"requests":{"cpu":"100m","memory":"128Mi"}}` | 
Resource limits and requests for Grafana | +| grafana.service | object | `{"port":80,"targetPort":3000,"type":"ClusterIP"}` | Service configuration | +| grafana.ingress | object | `{"annotations":{},"enabled":false,"hosts":[],"ingressClassName":"","path":"/","pathType":"ImplementationSpecific","tls":[]}` | Ingress configuration | \ No newline at end of file diff --git a/docs/configuration-guide.rst b/docs/configuration-guide.rst index a249fb5b..cdd57845 100644 --- a/docs/configuration-guide.rst +++ b/docs/configuration-guide.rst @@ -5,9 +5,6 @@ The following guide will help you configure ``values.yaml`` file for a SuperSONI The full list of parameters can be found in the `Configuration Reference `_. -Triton Inference Server Configuration -**************************************** - 1. Select a Triton Inference Server version ============================================= @@ -127,15 +124,11 @@ Triton version must be specified in the ``triton.image`` parameter in the values - NVIDIA-L4 -Envoy Proxy Configuration -**************************************** +4. Configure Envoy Proxy +================================================ By default, Envoy proxy is enabled and configured to provide per-request load balancing between Triton inference servers. - -4. Configure external endpoint for Envoy Proxy -================================================ - Once the SuperSONIC server is installed, you need an URL to which clients can connect and send inference requests. There are two options: @@ -144,10 +137,12 @@ There are two options: You can configure the Ingress resource via the ``ingress`` parameters in the values file: .. code-block:: yaml - - ingress: - enabled: false - hostName: "" + envoy: + ingress: + enabled: true + hostName: "" + ingressClassName: "" + annotations: {} In this case, the client connections should be established to ``:443`` and use SSL. @@ -155,7 +150,7 @@ There are two options: not be allowed at some Kubernetes clusters. 
To enable this, set the following parameters in the values file: - ``envoy.service.type: LoadBalancer`` - - ``ingress.enabled: false`` + - ``envoy.ingress.enabled: false`` The LoadBalancer service can then be mapped to an external URL, depending on the settings of a given cluster. Please contact cluster administrators for more information. @@ -201,11 +196,23 @@ There are two types of rate limiting available in Envoy Proxy: *listener-level*, The metric and threshold for the Prometheus-based rate limiter are the same as those used for the autoscaler (see Prometheus Configuration). +6. (optional) Configure authentication in Envoy Proxy +====================================================== + +At the moment, the only supported authentication method is JWT. Example configuration for IceCube: + +.. code-block:: yaml -Prometheus Configuration -**************************************** + envoy: + auth: + enabled: true + jwt_issuer: https://keycloak.icecube.wisc.edu/auth/realms/IceCube + jwt_remote_jwks_uri: https://keycloak.icecube.wisc.edu/auth/realms/IceCube/protocol/openid-connect/certs + audiences: [icecube] + url: keycloak.icecube.wisc.edu + port: 443 -6. Deploy a Prometheus server or connect to an existing one +7. Deploy a Prometheus server or connect to an existing one ============================================================ Prometheus is needed to scrape metrics for monitoring, as well as for the rate limiter and autoscaler. @@ -216,66 +223,92 @@ Prometheus is needed to scrape metrics for monitoring, as well as for the rate l rate limiter and autoscaler. Prometheus server typically uses only a small amount of resources and does not require special permissions for installation. + This option installs Prometheus as a subchart; reasonable default values are pre-configured. 
+ You can further customize the Prometheus installation by passing parameters from + official Prometheus `values.yaml `_ file + under the ``prometheus`` section of the SuperSONIC values file: + .. code-block:: yaml prometheus: - external: false - ingress: - enabled: true - hostName: "" + enabled: true + + + The parameters you will most likely need to configure in your values file are related to + Ingress for web access to Prometheus UI. + + .. warning:: + + This option requires permissions to list pods in the installation namespace. + Permission validation is performed automatically: if you don't have the necessary permissions, + an error message will be printed when running ``helm install`` command. - **Option 2**: Connect to an existing Prometheus server. + If you don't have enough permissions to install a new Prometheus server, + you can connect to an existing one. If ``prometheus.external.enabled`` is set to ``true``, + all parameters in the ``prometheus`` section, except ``prometheus.external``, are ignored. + .. code-block:: yaml prometheus: - external: true - url: "" - port: - scheme: "https" # or "http" + external: + enabled: true + scheme: "https" # or "http" + url: "" + port: +8. (optional) Configure metrics for scaling and rate limiting +=============================================================== + Both the rate limiter and the autoscaler are currently configured to use the same Prometheus metric and threshold. -They are defined in the ``prometheus.serverLoadMetric`` and ``prometheus.serverLoadThreshold`` parameters in the values file. +They are defined in the ``serverLoadMetric`` and ``serverLoadThreshold`` parameters in the root level of the values file. The default metric is the inference queue time at the Triton servers, as defined in -`here `_. +`here `_. When the metric value exceeds the threshold, the following happens: - Autoscaler scales up the number of Triton servers if possible. - Envoy proxy rejects new ``RepositoryIndex`` requests. 
The pre-configured Grafana dashboard contains a graph of this metric, entitled "Server Load Metric". -The Prometheus query for the graph is automatically inferred from the value of ``prometheus.serverLoadMetric`` parameter. -The graph also displays the threshold value defined in ``prometheus.serverLoadThreshold`` parameter. +The Prometheus query for the graph is automatically inferred from the value of the ``serverLoadMetric`` parameter. +The graph also displays the threshold value defined in the ``serverLoadThreshold`` parameter. -Grafana Configuration -**************************************** -7. Configure Grafana dashboard +9. (optional) Deploy Grafana dashboard ========================================== Grafana is used to visualize metrics collected by Prometheus. We provide a pre-configured Grafana dashboard which includes many useful metrics, including latency breakdown, GPU utilization, and more. +Grafana is installed as a subchart with most of the default values pre-configured. +You can further customize the Grafana installation by passing parameters from +official Grafana `values.yaml `_ file +under the ``grafana`` section of the SuperSONIC values file: + .. code-block:: yaml grafana: enabled: true - ingress: - enabled: true - hostName: "" + +The values you will most likely need to configure in your values file are related to +Grafana Ingress for web access, and datasources to connect to Prometheus. -Autoscaler Configuration -**************************************** -8. (optional) Enable KEDA autoscaler +10. (optional) Enable KEDA autoscaler ========================================== Autoscaling is implemented via `KEDA (Kubernetes Event-Driven Autoscaler) `_ and can be enabled via the ``autoscaler.enabled`` parameter in the values file. +.. warning:: + + Deploying KEDA autoscaler requires KEDA CustomResourceDefinitions to be installed in the cluster. + Please contact cluster administrators if this step of installation fails. 
+ The parameters ``autoscaler.minReplicas`` and ``autoscaler.maxReplicas`` define the range in which the number of Triton servers can scale. @@ -286,14 +319,14 @@ Additional optional parameters can control how quickly the autoscaler reacts to autoscaler: enabled: true - minReplicas: 1 - maxReplicas: 10 + minReplicaCount: 1 + maxReplicaCount: 10 scaleUp: - window: 120 - period: 30 + stabilizationWindowSeconds: 120 + periodSeconds: 30 stepsize: 1 scaleDown: - window: 120 - period: 30 + stabilizationWindowSeconds: 120 + periodSeconds: 30 stepsize: 1 diff --git a/docs/getting-started.rst b/docs/getting-started.rst index 50442b6e..5486ce0b 100644 --- a/docs/getting-started.rst +++ b/docs/getting-started.rst @@ -6,7 +6,9 @@ Pre-requisites ~~~~~~~~~~~~~~~ 1. `Kubernetes `_ cluster - 2. `KEDA `_ (if using autoscaling) – may require cluster administrator to install CustomResourceDefinitions. + 2. `Helm `_ + 3. Access to an existing Prometheus instance in the cluster, or sufficient permissions to deploy a custom instance (preferred). + 4. `KEDA `_ (if using autoscaling) – may require cluster administrator to install CustomResourceDefinitions. Installation ~~~~~~~~~~~~~~ @@ -17,11 +19,13 @@ Installation - `Configuration reference `_ - `Example values.yaml files `_ - 2. Add FastML Helm repository: + 2. Add Helm repositories for SuperSONIC and dependencies: .. code:: shell helm repo add fastml https://fastmachinelearning.org/SuperSONIC + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo add grafana https://grafana.github.io/helm-charts helm repo update 3. 
Modify the following command to install the chart at your cluster: diff --git a/docs/img/diagram.svg b/docs/img/diagram.svg index 7477bcba..4aee38fa 100644 --- a/docs/img/diagram.svg +++ b/docs/img/diagram.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/helm/supersonic/.helmignore b/helm/supersonic/.helmignore index 7bfecee0..505900ad 100644 --- a/helm/supersonic/.helmignore +++ b/helm/supersonic/.helmignore @@ -7,4 +7,4 @@ *.log # Helm packaging -*.tgz +# *.tgz diff --git a/helm/supersonic/Chart.lock b/helm/supersonic/Chart.lock new file mode 100644 index 00000000..20861b33 --- /dev/null +++ b/helm/supersonic/Chart.lock @@ -0,0 +1,9 @@ +dependencies: +- name: grafana + repository: https://grafana.github.io/helm-charts + version: 8.9.0 +- name: prometheus + repository: https://prometheus-community.github.io/helm-charts + version: 27.3.0 +digest: sha256:0e64db91a28a1fed4cd5b8b080af6859681e6e290fb2cc261d59474770d56b16 +generated: "2025-02-11T23:09:12.270167-05:00" diff --git a/helm/supersonic/Chart.yaml b/helm/supersonic/Chart.yaml index be7a7d17..584b68da 100644 --- a/helm/supersonic/Chart.yaml +++ b/helm/supersonic/Chart.yaml @@ -10,3 +10,13 @@ annotations: artifacthub.io/links: | - name: GitHub url: https://github.com/fastmachinelearning/SuperSONIC/ + +dependencies: + - name: grafana + version: "8.9.0" + repository: https://grafana.github.io/helm-charts + condition: grafana.enabled + - name: prometheus + version: "27.3.0" + repository: https://prometheus-community.github.io/helm-charts + condition: prometheus.enabled diff --git a/helm/supersonic/README.md b/helm/supersonic/README.md index 5164ad40..4e38dbe3 100644 --- a/helm/supersonic/README.md +++ b/helm/supersonic/README.md @@ -26,6 +26,8 @@ The main components of SuperSONIC are: ``` helm repo add fastml https://fastmachinelearning.org/SuperSONIC +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo add grafana 
https://grafana.github.io/helm-charts helm repo update helm install fastml/supersonic --values -n ``` diff --git a/helm/supersonic/cfg/envoy-filter.lua b/helm/supersonic/cfg/envoy-filter.lua index 5b8c3b11..0af267cd 100644 --- a/helm/supersonic/cfg/envoy-filter.lua +++ b/helm/supersonic/cfg/envoy-filter.lua @@ -14,8 +14,9 @@ function envoy_on_request(request_handle) local query_response_template = '"value":%[%d+%.%d+,"([%d%.]+)"%]' local encoded_query = encode_query(query) - request_handle:logInfo("Prometheus URL: " .. "PROMETHEUS_URL") request_handle:logInfo("Prometheus scheme: " .. "PROMETHEUS_SCHEME") + request_handle:logInfo("Prometheus host: " .. "PROMETHEUS_HOST") + request_handle:logInfo("Prometheus port: " .. "PROMETHEUS_PORT") request_handle:logInfo("Query: " .. query) request_handle:logInfo("Encoded query: " .. encoded_query) @@ -24,8 +25,8 @@ function envoy_on_request(request_handle) { [":method"] = "GET", [":path"] = "/api/v1/query?query=" .. encoded_query, - [":authority"] = "PROMETHEUS_URL", - [":scheme"] = "PROMETHEUS_SCHEME" + [":scheme"] = "PROMETHEUS_SCHEME", + [":authority"] = "PROMETHEUS_HOST" .. ":" .. 
"PROMETHEUS_PORT" }, "", 5000 diff --git a/helm/supersonic/dashboards/default.json b/helm/supersonic/dashboards/default.json index 0d62cdb3..a8150d1e 100644 --- a/helm/supersonic/dashboards/default.json +++ b/helm/supersonic/dashboards/default.json @@ -17,7 +17,7 @@ }, "editable": true, "fiscalYearStartMonth": 0, - "graphTooltip": 2, + "graphTooltip": 1, "liveNow": false, "panels": [ { @@ -92,7 +92,7 @@ "colorMode": "value", "graphMode": "none", "justifyMode": "auto", - "orientation": "horizontal", + "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" @@ -112,7 +112,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=~\"${release_name}\"})", + "expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=~\"${release_name}\", namespace=~\"${namespace}\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -197,7 +197,7 @@ "calcs": [], "displayMode": "list", "placement": "bottom", - "showLegend": true + "showLegend": false }, "tooltip": { "mode": "multi", @@ -328,7 +328,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (app) ( rate(nv_inference_compute_infer_duration_us{release=~\"${release_name}\"}[15s])) /sum by (app) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)", + "expr": "sum (rate(nv_inference_compute_infer_duration_us{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) / sum ((rate(nv_inference_exec_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s]) * 1000))", "instant": false, "legendFormat": "Inference", "range": true, @@ -340,7 +340,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (app) ( rate(nv_inference_queue_duration_us{release=~\"${release_name}\"}[15s])) /sum by (app) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 
0.001)", + "expr": "sum (rate(nv_inference_queue_duration_us{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) / sum ((rate(nv_inference_exec_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s]) * 1000))", "hide": false, "instant": false, "legendFormat": "Queue", @@ -353,7 +353,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (app) ( rate(nv_inference_compute_input_duration_us{release=~\"${release_name}\"}[15s])) /sum by (app) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)", + "expr": "sum (rate(nv_inference_compute_input_duration_us{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) / sum ((rate(nv_inference_exec_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s]) * 1000))", "hide": false, "instant": false, "legendFormat": "Input", @@ -366,7 +366,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (app) ( rate(nv_inference_compute_output_duration_us{release=~\"${release_name}\"}[15s])) /sum by (app) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)", + "expr": "sum (rate(nv_inference_compute_output_duration_us{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) / sum ((rate(nv_inference_exec_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s]) * 1000))", "hide": false, "instant": false, "legendFormat": "Output", @@ -379,7 +379,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": " sum(\n rate(envoy_http_downstream_rq_time_sum{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=~\"${release_name}\"}[15s])\n /\n rate(envoy_http_downstream_rq_time_count{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=~\"${release_name}\"}[15s])\n ) by (app)", + "expr": "sum(rate(envoy_http_downstream_rq_time_sum{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) / 
sum(rate(envoy_http_downstream_rq_time_count{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s]))", "hide": false, "instant": false, "legendFormat": "Total (measured at proxy)", @@ -462,7 +462,7 @@ "calcs": [], "displayMode": "list", "placement": "bottom", - "showLegend": true + "showLegend": false }, "tooltip": { "mode": "multi", @@ -477,7 +477,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=~\"${release_name}\"})", + "expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=~\"${release_name}\", namespace=~\"${namespace}\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -600,7 +600,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "nv_gpu_utilization", + "expr": "nv_gpu_utilization{namespace=~\"${namespace}\"}", "hide": false, "instant": false, "legendFormat": "{{pod}}", @@ -708,7 +708,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "nv_gpu_power_usage / nv_gpu_power_limit", + "expr": "(nv_gpu_power_usage{namespace=~\"${namespace}\"} / nv_gpu_power_limit{namespace=~\"${namespace}\"})", "hide": false, "instant": false, "legendFormat": "{{pod}}", @@ -762,6 +762,7 @@ } }, "mappings": [], + "max": 1, "min": 0, "thresholds": { "mode": "absolute", @@ -784,7 +785,7 @@ } ] }, - "unit": "bytes" + "unit": "percentunit" }, "overrides": [] }, @@ -815,7 +816,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "nv_gpu_memory_used_bytes", + "expr": "nv_gpu_memory_used_bytes{namespace=~\"${namespace}\"}", "hide": false, "instant": false, "legendFormat": "{{pod}}", @@ -909,7 +910,7 @@ "calcs": [], "displayMode": "list", "placement": "right", - "showLegend": true + "showLegend": false }, "tooltip": { "mode": "multi", @@ -925,7 +926,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum 
(rate(nv_inference_count{release=~\"${release_name}\"}[15s]))by(release)", + "expr": "sum (rate(nv_inference_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) by(release)", "instant": false, "interval": "", "legendFormat": "{{ release }}", @@ -1022,7 +1023,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum (rate(nv_inference_count{release=~\"${release_name}\"}[15s]))by(pod)", + "expr": "sum (rate(nv_inference_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) by(pod)", "instant": false, "interval": "", "legendFormat": "{{ pod }}", @@ -1119,7 +1120,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum (rate(nv_inference_count{release=~\"${release_name}\"}[15s]))by(model)", + "expr": "sum (rate(nv_inference_count{release=~\"${release_name}\", namespace=~\"${namespace}\"}[15s])) by(model)", "instant": false, "interval": "", "legendFormat": "{{ model }}", diff --git a/helm/supersonic/dashboards/variables.json b/helm/supersonic/dashboards/variables.json index 935d59c5..9787f381 100644 --- a/helm/supersonic/dashboards/variables.json +++ b/helm/supersonic/dashboards/variables.json @@ -3,15 +3,18 @@ "list": [ { "current": { - "selected": true, - "text": ["supersonic"], - "value": ["supersonic"] + "text": [ + "All" + ], + "value": [ + "$__all" + ] }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(up{app=\"%CHART_NAME%\", namespace=\"%NAMESPACE%\"}, release)", + "definition": "label_values({app=\"%CHART_NAME%\",namespace=\"%NAMESPACE%\"},release)", "hide": 0, "includeAll": true, "label": "Release", @@ -20,7 +23,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(up{app=\"%CHART_NAME%\", namespace=\"%NAMESPACE%\"}, release)", + "query": "label_values({app=\"%CHART_NAME%\",namespace=\"%NAMESPACE%\"},release)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -53,6 +56,14 @@ "skipUrlSync": false, "type": "constant" }, + { + 
"hide": 2, + "label": "Version", + "name": "version", + "query": "%CHART_NAME%", + "skipUrlSync": false, + "type": "constant" + }, { "hide": 2, "label": "Prometheus URL", @@ -60,6 +71,14 @@ "query": "%PROMETHEUS_URL_FULL%", "skipUrlSync": false, "type": "constant" + }, + { + "hide": 2, + "label": "Namespace", + "name": "namespace", + "query": "%NAMESPACE%", + "skipUrlSync": false, + "type": "constant" } ] } diff --git a/helm/supersonic/templates/NOTES.txt b/helm/supersonic/templates/NOTES.txt index 4704f35e..3ffbcf94 100644 --- a/helm/supersonic/templates/NOTES.txt +++ b/helm/supersonic/templates/NOTES.txt @@ -1,4 +1,11 @@ --- +{{- /* Run validation checks */ -}} +{{- include "supersonic.validateGrafana" . -}} +{{- include "supersonic.validateGrafanaAddressConsistency" . -}} +{{- include "supersonic.validateGrafanaValues" . -}} +{{- include "supersonic.validatePrometheus" . -}} +{{- include "supersonic.validatePrometheusAddressConsistency" . -}} +{{- include "supersonic.validatePrometheusValues" . -}} ____ ___ ___ _ _ ___ ___ / __/_ _____ ___ ____/ __|/ _ \| \| |_ _/ __| @@ -16,35 +23,24 @@ SuperSONIC chart successfully installed! | * equal to release name, unless nameOverride is specified. └-----------------------------------------------------------------------------┘ -Scaling metric:{{ if not ( eq .Values.prometheus.serverLoadMetric "" ) }} {{ .Values.prometheus.serverLoadMetric }}{{ else }}{{ include "supersonic.defaultMetric" . | nindent 4 }}{{ end }} +Scaling metric:{{ if not ( eq .Values.serverLoadMetric "" ) }} {{ .Values.serverLoadMetric }}{{ else }}{{ include "supersonic.defaultMetric" . | nindent 4 }}{{ end }} -Scaling threshold: {{ include "supersonic.serverLoadThreshold" . }}{{"\n"}} +Scaling threshold: {{ include "supersonic.defaultThreshold" . }} -{{- if or (and .Values.grafana.enabled (eq (include "supersonic.grafanaExists" .) "true")) (eq (include "supersonic.prometheusExists" .) 
"true") .Values.prometheus.external }} -┌-----------------------------------------------------------------------------┐ -| NOTICE: Using existing/external monitoring components -{{- if and .Values.grafana.enabled (eq (include "supersonic.grafanaExists" .) "true") }} -| • Re-using existing Grafana instance -{{- end }} -{{- if .Values.prometheus.external }} -| • Using external Prometheus instance -{{- else if (eq (include "supersonic.prometheusExists" .) "true") }} -| • Re-using existing Prometheus instance -{{- end }} -└-----------------------------------------------------------------------------┘ -{{- end }} ┌-----------------------------------------------------------------------------┐ | Documentation: https://fastmachinelearning.org/SuperSONIC | | Uninstall: helm uninstall {{ .Release.Name }} -n {{ .Release.Namespace }} +{{- if .Values.envoy.enabled }} | -{{- if .Values.ingress.enabled }} | gRPC endpoint: {{ include "supersonic.grpcEndpoint" . }} {{- end }} +{{- if or .Values.prometheus.external.enabled .Values.prometheus.enabled (include "supersonic.prometheusExists" . ) }} | -| Prometheus UI: {{ include "supersonic.prometheusUrl" . }} +| Prometheus UI: {{ include "supersonic.prometheusDisplayUrl" . }} +{{- end }} +{{- if or .Values.grafana.enabled (include "supersonic.grafanaExists" .) }} | -{{- if .Values.grafana.enabled }} -| Grafana dashboard: {{ include "supersonic.grafanaUrl" . }} +| Grafana dashboard: {{ include "supersonic.grafanaDisplayUrl" . 
}} {{- end }} └-----------------------------------------------------------------------------┘ diff --git a/helm/supersonic/templates/_common.tpl b/helm/supersonic/templates/_common.tpl new file mode 100644 index 00000000..61f7b30a --- /dev/null +++ b/helm/supersonic/templates/_common.tpl @@ -0,0 +1,250 @@ +{{/* +Common helper functions for SuperSONIC services +*/}} + +{{/* +Get service scheme - takes service type and values as parameters +*/}} +{{- define "supersonic.common.getServiceScheme" -}} +{{- $serviceType := .serviceType -}} +{{- $values := .values -}} +{{- if eq $serviceType "prometheus" -}} + {{- if $values.prometheus.external.enabled -}} + {{- $values.prometheus.external.scheme -}} + {{- else if $values.prometheus.enabled -}} + {{- if and $values.prometheus.server.ingress.enabled $values.prometheus.server.ingress.tls -}} + {{- printf "https" -}} + {{- else -}} + {{- printf "http" -}} + {{- end -}} + {{- else -}} + {{- printf "https" -}} + {{- end -}} +{{- else if eq $serviceType "grafana" -}} + {{- if $values.grafana.enabled -}} + {{- if $values.grafana.ingress.enabled -}} + {{- if $values.grafana.ingress.tls -}} + {{- printf "https" -}} + {{- else -}} + {{- printf "http" -}} + {{- end -}} + {{- else -}} + {{- printf "http" -}} + {{- end -}} + {{- else -}} + {{- printf "https" -}} + {{- end -}} +{{- else -}} + {{- printf "https" -}} +{{- end -}} +{{- end -}} + +{{/* +Check if service exists - takes service name as parameter +*/}} +{{- define "supersonic.common.serviceExists" -}} +{{- $serviceName := .serviceName -}} +{{- $exists := "" -}} +{{- if (lookup "apps/v1" "Deployment" .root.Release.Namespace "") -}} + {{- range (lookup "apps/v1" "Deployment" .root.Release.Namespace "").items -}} + {{- if eq (index .metadata.labels "app.kubernetes.io/name") $serviceName -}} + {{- $exists = "true" -}} + {{- end -}} + {{- end -}} +{{- end -}} +{{- $exists -}} +{{- end -}} + +{{/* +Get service details - takes service type and root context as parameters +*/}} +{{- 
define "supersonic.common.getServiceDetails" -}} +{{- $serviceType := .serviceType -}} +{{- $root := .root -}} +{{- $defaultPort := .defaultPort | default "80" -}} +{{- $details := dict "scheme" "http" "port" $defaultPort -}} +{{- $found := false -}} + +{{- /* Try to get details from ingress first */ -}} +{{- if (lookup "networking.k8s.io/v1" "Ingress" $root.Release.Namespace "") -}} + {{- range (lookup "networking.k8s.io/v1" "Ingress" $root.Release.Namespace "").items -}} + {{- if eq (index .metadata.labels "app.kubernetes.io/name") $serviceType -}} + {{- if .spec.rules -}} + {{- $details = merge $details (dict "host" (index .spec.rules 0).host) -}} + {{- if .spec.tls -}} + {{- $details = merge $details (dict "scheme" "https" "port" "443") -}} + {{- else -}} + {{- $details = merge $details (dict "port" "80") -}} + {{- end -}} + {{- $found = true -}} + {{- end -}} + {{- end -}} + {{- end -}} +{{- end -}} + +{{- /* Fall back to service if ingress not found */ -}} +{{- if (not $found) -}} + {{- if (lookup "v1" "Service" $root.Release.Namespace "") -}} + {{- range (lookup "v1" "Service" $root.Release.Namespace "").items -}} + {{- if eq (index .metadata.labels "app.kubernetes.io/name") $serviceType -}} + {{- $details = merge $details (dict "host" (printf "%s.%s.svc.cluster.local" .metadata.name $root.Release.Namespace)) -}} + {{- end -}} + {{- end -}} + {{- end -}} +{{- end -}} + +{{- $details | toJson -}} +{{- end -}} + +{{/* +Validate service address consistency +*/}} +{{- define "supersonic.common.validateAddressConsistency" -}} +{{- $serviceType := .serviceType -}} +{{- $values := .values -}} +{{- $root := .root -}} + +{{- if eq $serviceType "prometheus" -}} + {{- if and $values.prometheus.enabled $values.prometheus.server.ingress.enabled -}} + {{- /* Extract and validate ingress host */ -}} + {{- if not $values.prometheus.server.ingress.hosts -}} + {{- fail "Parameter missing: prometheus.server.ingress.hosts" -}} + {{- end -}} + {{- $ingressHost := first 
$values.prometheus.server.ingress.hosts -}} + + {{- /* Validate TLS host if TLS is enabled */ -}} + {{- if $values.prometheus.server.ingress.tls -}} + {{- if not (first $values.prometheus.server.ingress.tls).hosts -}} + {{- fail "Parameter missing: prometheus.server.ingress.tls[0].hosts" -}} + {{- end -}} + {{- $tlsHost := first (first $values.prometheus.server.ingress.tls).hosts -}} + {{- if ne $ingressHost $tlsHost -}} + {{- fail (printf "Mismatched configuration. For internal consistency of SuperSONIC components, please set the following parameter:\nprometheus.server.ingress.tls[0].hosts[0]: %s" $ingressHost) -}} + {{- end -}} + {{- end -}} + {{- end -}} +{{- else if eq $serviceType "grafana" -}} + {{- if and $values.grafana.enabled $values.grafana.ingress.enabled -}} + {{- /* Extract and validate ingress host */ -}} + {{- if not $values.grafana.ingress.hosts -}} + {{- fail "Parameter missing: grafana.ingress.hosts" -}} + {{- end -}} + {{- $ingressHost := first $values.grafana.ingress.hosts -}} + + {{- /* Validate TLS host if TLS is enabled */ -}} + {{- if $values.grafana.ingress.tls -}} + {{- if not (first $values.grafana.ingress.tls).hosts -}} + {{- fail "Parameter missing: grafana.ingress.tls[0].hosts" -}} + {{- end -}} + {{- $tlsHost := first (first $values.grafana.ingress.tls).hosts -}} + {{- if ne $ingressHost $tlsHost -}} + {{- fail (printf "Mismatched configuration. For internal consistency of SuperSONIC components, please set the following parameter:\ngrafana.ingress.tls[0].hosts[0]: %s" $ingressHost) -}} + {{- end -}} + {{- end -}} + + {{- /* Validate root_url if specified */ -}} + {{- if (index $values.grafana "grafana.ini").server.root_url -}} + {{- $rootUrl := (index $values.grafana "grafana.ini").server.root_url -}} + {{- $expectedRootUrl := printf "https://%s" $ingressHost -}} + {{- if ne $rootUrl $expectedRootUrl -}} + {{- fail (printf "Mismatched configuration. 
For internal consistency of SuperSONIC components, please set the following parameter:\ngrafana.grafana.ini.server.root_url: %s" $expectedRootUrl) -}} + {{- end -}} + {{- end -}} + {{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Get service name with prefix +*/}} +{{- define "supersonic.common.getServiceName" -}} +{{- $serviceName := .serviceName -}} +{{- printf "%s-%s" (include "supersonic.name" .root) $serviceName | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Get service display URL (without standard ports) +*/}} +{{- define "supersonic.common.getServiceDisplayUrl" -}} +{{- $scheme := .scheme -}} +{{- $host := .host -}} +{{- printf "%s://%s" $scheme $host -}} +{{- end -}} + +{{/* +Validate no existing service instance when enabling a new one (only checked when prometheus.enabled is true and prometheus.external.enabled is false, so that setting prometheus.enabled=false really does allow re-using an existing instance) +*/}} +{{- define "supersonic.common.validateNoExistingService" -}} +{{- $serviceType := .serviceType -}} +{{- $values := .values -}} +{{- $root := .root -}} +{{- if and (eq $serviceType "prometheus") $values.prometheus.enabled (not $values.prometheus.external.enabled) -}} + {{- if include "supersonic.common.serviceExists" (dict "serviceName" $serviceType "root" $root) -}} + {{- $details := fromJson (include "supersonic.common.getServiceDetails" (dict "serviceType" $serviceType "root" $root)) -}} + {{- $url := printf "%s://%s" $details.scheme $details.host -}} + {{- fail (printf "Error: Found existing %s instance in the namespace:\n- Namespace: %s\n- URL: %s\n\nTo proceed, either:\n1. Set %s.enabled=false in values.yaml to use existing instance, OR\n2.
Uninstall the existing instance" $serviceType $root.Release.Namespace $url $serviceType) -}} + {{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Get full service URL +*/}} +{{- define "supersonic.common.getServiceUrl" -}} +{{- $scheme := .scheme -}} +{{- $host := .host -}} +{{- $port := .port -}} +{{- printf "%s://%s:%s" $scheme $host $port -}} +{{- end -}} + +{{/* +Get existing service details by type +*/}} +{{- define "supersonic.common.getExistingServiceDetails" -}} +{{- $serviceType := .serviceType -}} +{{- $root := .root -}} +{{- $defaultPort := "" -}} +{{- if eq $serviceType "prometheus" -}} + {{- $defaultPort = "9090" -}} +{{- else if eq $serviceType "grafana" -}} + {{- $defaultPort = "80" -}} +{{- end -}} +{{- include "supersonic.common.getServiceDetails" (dict "serviceType" $serviceType "root" $root "defaultPort" $defaultPort) -}} +{{- end -}} + +{{/* +Get existing service scheme +*/}} +{{- define "supersonic.common.getExistingServiceScheme" -}} +{{- $details := fromJson (include "supersonic.common.getExistingServiceDetails" .) -}} +{{- $details.scheme -}} +{{- end -}} + +{{/* +Get existing service host +*/}} +{{- define "supersonic.common.getExistingServiceHost" -}} +{{- $details := fromJson (include "supersonic.common.getExistingServiceDetails" .) -}} +{{- $details.host -}} +{{- end -}} + +{{/* +Get existing service port +*/}} +{{- define "supersonic.common.getExistingServicePort" -}} +{{- $details := fromJson (include "supersonic.common.getExistingServiceDetails" .) 
-}} +{{- $details.port -}} +{{- end -}} + +{{/* +Get existing service URL +*/}} +{{- define "supersonic.common.getExistingServiceUrl" -}} +{{- $serviceType := .serviceType -}} +{{- $values := .values -}} +{{- if eq $serviceType "prometheus" -}} + {{- $values.prometheus.existingUrl -}} +{{- else if eq $serviceType "grafana" -}} + {{- $values.grafana.existingUrl -}} +{{- end -}} +{{- end -}} \ No newline at end of file diff --git a/helm/supersonic/templates/_grafana.tpl b/helm/supersonic/templates/_grafana.tpl index 82db4a95..99aec449 100644 --- a/helm/supersonic/templates/_grafana.tpl +++ b/helm/supersonic/templates/_grafana.tpl @@ -2,74 +2,125 @@ Get Grafana name */}} {{- define "supersonic.grafanaName" -}} -{{- printf "%s-grafana" (include "supersonic.name" .) | trunc 63 | trimSuffix "-" -}} +{{- include "supersonic.common.getServiceName" (dict "serviceName" "grafana" "root" .) -}} {{- end -}} {{/* -Check if Grafana exists in the namespace (from any release) +Get Grafana scheme */}} -{{- define "supersonic.grafanaExists" -}} -{{- $root := . 
-}} -{{- $exists := false -}} -{{- if (lookup "v1" "Service" .Release.Namespace "") -}} - {{- range (lookup "v1" "Service" .Release.Namespace "").items -}} - {{- if and (eq (index .metadata.labels "app.kubernetes.io/name") "supersonic") - (eq (index .metadata.labels "app.kubernetes.io/component") "grafana") - (ne (index .metadata.labels "app.kubernetes.io/instance") (include "supersonic.name" $root))}} - {{- $exists = true -}} - {{- break -}} +{{- define "supersonic.grafanaScheme" -}} +{{- include "supersonic.common.getServiceScheme" (dict "serviceType" "grafana" "values" .Values) -}} +{{- end -}} + +{{/* +Get Grafana host +*/}} +{{- define "supersonic.grafanaHost" -}} +{{- if .Values.grafana.enabled -}} + {{- if and .Values.grafana.ingress.enabled .Values.grafana.ingress.hosts -}} + {{- first .Values.grafana.ingress.hosts -}} + {{- else -}} + {{- printf "%s-grafana.%s.svc.cluster.local" .Release.Name .Release.Namespace -}} {{- end -}} - {{- end -}} +{{- else -}} + {{- include "supersonic.common.getExistingServiceHost" (dict "serviceType" "grafana" "root" .) -}} {{- end -}} -{{- $exists -}} {{- end -}} {{/* -Get existing Grafana service name (from any release) +Get Grafana port */}} -{{- define "supersonic.existingGrafanaName" -}} -{{- $root := . 
-}} -{{- range (lookup "v1" "Service" .Release.Namespace "").items }} - {{- if and (eq (index .metadata.labels "app.kubernetes.io/name") "supersonic") - (eq (index .metadata.labels "app.kubernetes.io/component") "grafana") - (ne (index .metadata.labels "app.kubernetes.io/instance") (include "supersonic.name" $root))}} - {{- .metadata.name -}} - {{- break }} - {{- end }} -{{- end }} +{{- define "supersonic.grafanaPort" -}} +{{- if .Values.grafana.enabled -}} + {{- if .Values.grafana.ingress.enabled -}} + {{- if .Values.grafana.ingress.tls -}} + {{- printf "443" -}} + {{- else -}} + {{- printf "80" -}} + {{- end -}} + {{- else -}} + {{- .Values.grafana.service.port | default "80" -}} + {{- end -}} +{{- else -}} + {{- include "supersonic.common.getExistingServicePort" (dict "serviceType" "grafana" "root" .) -}} +{{- end -}} {{- end -}} {{/* -Get Grafana URL (handles ingress, existing, and new instances) +Get full Grafana URL */}} {{- define "supersonic.grafanaUrl" -}} -{{- if and .Values.grafana.ingress.enabled .Values.grafana.ingress.hostName -}} -https://{{ .Values.grafana.ingress.hostName }} -{{- else -}} - {{- $foundIngress := false -}} - {{- if (lookup "networking.k8s.io/v1" "Ingress" .Release.Namespace "") -}} - {{- $root := . -}} - {{- range (lookup "networking.k8s.io/v1" "Ingress" .Release.Namespace "").items -}} - {{- if and (eq (index .metadata.labels "app.kubernetes.io/name") "supersonic") - (eq (index .metadata.labels "app.kubernetes.io/component") "grafana") - (ne (index .metadata.labels "app.kubernetes.io/instance") (include "supersonic.name" $root))}} - {{- range .spec.rules -}} - {{- if .host -}} - {{- $foundIngress = true -}} -https://{{ .host }} - {{- break -}} +{{- include "supersonic.common.getServiceUrl" (dict "scheme" (include "supersonic.grafanaScheme" .) "host" (include "supersonic.grafanaHost" .) 
"port" (include "supersonic.grafanaPort" .)) -}} +{{- end -}} + +{{/* +Check if Grafana exists in the namespace +*/}} +{{- define "supersonic.grafanaExists" -}} +{{- include "supersonic.common.serviceExists" (dict "serviceName" "grafana" "root" .) -}} +{{- end -}} + +{{/* +Validate that there is no existing Grafana instance when enabling a new one +*/}} +{{- define "supersonic.validateGrafana" -}} +{{- if .Values.grafana.enabled -}} + {{- if include "supersonic.grafanaExists" . -}} + {{- $details := fromJson (include "supersonic.common.getExistingServiceDetails" (dict "serviceType" "grafana" "root" .)) -}} + {{- $url := printf "%s://%s:%s" $details.scheme $details.host $details.port -}} + {{- fail (printf "Error: Found existing Grafana instance in the namespace:\n- Namespace: %s\n- URL: %s\n\nTo proceed, either:\n1. Set grafana.enabled=false in values.yaml to use the existing Grafana instance, OR\n2. Uninstall the existing Grafana instance" .Release.Namespace $url) -}} + {{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Validate Grafana address consistency +*/}} +{{- define "supersonic.validateGrafanaAddressConsistency" -}} +{{- include "supersonic.common.validateAddressConsistency" (dict "serviceType" "grafana" "values" .Values "root" .) -}} +{{- end -}} + +{{/* +Validate Grafana configuration values +*/}} +{{- define "supersonic.validateGrafanaValues" -}} +{{- if .Values.grafana.enabled -}} + {{- $releaseName := include "supersonic.name" . -}} + {{- $root := . -}} + + {{- /* Validate default dashboard name */ -}} + {{- if .Values.grafana.dashboardsConfigMaps -}} + {{- $configMapName := .Values.grafana.dashboardsConfigMaps.default -}} + {{- $expectedName := printf "%s-grafana-default-dashboard" $releaseName -}} + {{- if ne $configMapName $expectedName -}} + {{- fail (printf "Mismatched configuration. 
For internal consistency of SuperSONIC components, please set the following parameter:\ngrafana.dashboardsConfigMaps.default: %s" $expectedName) -}} + {{- end -}} + {{- end -}} + + {{- /* Validate Prometheus datasource URL */ -}} + {{- if .Values.grafana.datasources -}} + {{- range (index .Values.grafana.datasources "datasources.yaml").datasources -}} + {{- if and (eq .type "prometheus") .url -}} + {{- if $root.Values.prometheus.external.enabled -}} + {{- $expectedURL := printf "%s://%s" $root.Values.prometheus.external.scheme $root.Values.prometheus.external.url -}} + {{- if ne .url $expectedURL -}} + {{- fail (printf "Mismatched configuration. For internal consistency of SuperSONIC components with external Prometheus, please set the following parameter:\ngrafana:\n datasources:\n datasources.yaml:\n datasources:\n - name: prometheus\n type: prometheus\n access: proxy\n url: %s" $expectedURL) -}} + {{- end -}} + {{- else if $root.Values.prometheus.enabled -}} + {{- $expectedURL := printf "http://%s-prometheus-server:9090" $releaseName -}} + {{- if ne .url $expectedURL -}} + {{- fail (printf "Mismatched configuration. For internal consistency of SuperSONIC components with internal Prometheus, please set the following parameter:\ngrafana:\n datasources:\n datasources.yaml:\n datasources:\n - name: prometheus\n type: prometheus\n access: proxy\n url: %s" $expectedURL) -}} {{- end -}} {{- end -}} - {{- break -}} {{- end -}} {{- end -}} {{- end -}} - {{- if not $foundIngress -}} - {{- if (eq (include "supersonic.grafanaExists" .) "true") -}} -http://{{ include "supersonic.existingGrafanaName" . }}.{{ .Release.Namespace }}.svc.cluster.local - {{- else -}} -http://{{ include "supersonic.grafanaName" . 
}}.{{ .Release.Namespace }}.svc.cluster.local - {{- end -}} - {{- end -}} {{- end -}} -{{- end -}} \ No newline at end of file +{{- end -}} + +{{/* +Get full Grafana URL for display (without standard ports) +*/}} +{{- define "supersonic.grafanaDisplayUrl" -}} +{{- include "supersonic.common.getServiceDisplayUrl" (dict "scheme" (include "supersonic.grafanaScheme" .) "host" (include "supersonic.grafanaHost" .)) -}} +{{- end -}} \ No newline at end of file diff --git a/helm/supersonic/templates/_helpers.tpl b/helm/supersonic/templates/_helpers.tpl index 2293fdce..d76ffbfe 100644 --- a/helm/supersonic/templates/_helpers.tpl +++ b/helm/supersonic/templates/_helpers.tpl @@ -1,7 +1,7 @@ {{- /* templates/_helpers.tpl */ -}} {{/* -Get release name (or override) +Get instance name (equal to release name unless overridden) */}} {{- define "supersonic.name" -}} {{- if .Values.nameOverride }} @@ -12,24 +12,35 @@ Get release name (or override) {{- end -}} {{/* -Get Triton name +Get Triton server name */}} {{- define "supersonic.tritonName" -}} {{- printf "%s-triton" (include "supersonic.name" .) | trunc 63 | trimSuffix "-" -}} {{- end -}} {{/* -Get Envoy name +Get Envoy proxy name */}} {{- define "supersonic.envoyName" -}} {{- printf "%s-envoy" (include "supersonic.name" .) | trunc 63 | trimSuffix "-" -}} {{- end -}} {{/* -Get gRPC endpoint +Get gRPC endpoint for client connections */}} {{- define "supersonic.grpcEndpoint" -}} -{{- if .Values.ingress.enabled -}} -{{ .Values.ingress.hostName }}:443 -{{- end }} -{{- end }} \ No newline at end of file +{{- if .Values.envoy.enabled -}} + {{- if .Values.envoy.ingress.enabled -}} + {{- printf "%s:443" .Values.envoy.ingress.hostName -}} + {{- else -}} + {{- $serviceName := include "supersonic.name" . 
-}} + {{- $grpcPort := 8001 -}} + {{- range .Values.envoy.service.ports -}} + {{- if eq .name "grpc" -}} + {{- $grpcPort = .port -}} + {{- end -}} + {{- end -}} + {{- /* Ports read from values may arrive as float64; coerce to int so %d renders a plain number */ -}} {{- printf "%s.%s.svc.cluster.local:%d" $serviceName .Release.Namespace (int $grpcPort) -}} + {{- end -}} +{{- end -}} +{{- end -}} diff --git a/helm/supersonic/templates/_prometheus.tpl b/helm/supersonic/templates/_prometheus.tpl index 023bedfd..84cd90af 100644 --- a/helm/supersonic/templates/_prometheus.tpl +++ b/helm/supersonic/templates/_prometheus.tpl @@ -2,95 +2,135 @@ Get Prometheus name */}} {{- define "supersonic.prometheusName" -}} -{{- printf "%s-prometheus" (include "supersonic.name" .) | trunc 63 | trimSuffix "-" -}} +{{- include "supersonic.common.getServiceName" (dict "serviceName" "prometheus" "root" .) -}} {{- end -}} {{/* -Check if Prometheus exists in the namespace (from any release) +Get Prometheus scheme */}} -{{- define "supersonic.prometheusExists" -}} -{{- $root := . -}} -{{- $exists := false -}} -{{- if (lookup "v1" "Service" .Release.Namespace "") -}} - {{- range (lookup "v1" "Service" .Release.Namespace "").items -}} - {{- if and (eq (index .metadata.labels "app.kubernetes.io/name") "supersonic") - (eq (index .metadata.labels "app.kubernetes.io/component") "prometheus") - (ne (index .metadata.labels "app.kubernetes.io/instance") (include "supersonic.name" $root))}} - {{- $exists = true -}} - {{- break -}} +{{- define "supersonic.prometheusScheme" -}} +{{- include "supersonic.common.getServiceScheme" (dict "serviceType" "prometheus" "values" .Values) -}} +{{- end -}} + +{{/* +Get Prometheus host +*/}} +{{- define "supersonic.prometheusHost" -}} +{{- if .Values.prometheus.external.enabled -}} + {{- .Values.prometheus.external.url -}} +{{- else if .Values.prometheus.enabled -}} + {{- if and .Values.prometheus.server.ingress.enabled .Values.prometheus.server.ingress.hosts -}} + {{- first .Values.prometheus.server.ingress.hosts -}} + {{- else -}} + {{- printf
"%s-prometheus-server.%s.svc.cluster.local" (include "supersonic.name" .) .Release.Namespace -}} {{- end -}} - {{- end -}} +{{- else -}} + {{- include "supersonic.common.getExistingServiceHost" (dict "serviceType" "prometheus" "root" .) -}} {{- end -}} -{{- $exists -}} {{- end -}} {{/* -Get existing Prometheus service name (from any release) +Get Prometheus port */}} -{{- define "supersonic.existingPrometheusName" -}} -{{- $root := . -}} -{{- range (lookup "v1" "Service" .Release.Namespace "").items }} - {{- if and (eq (index .metadata.labels "app.kubernetes.io/name") "supersonic") - (eq (index .metadata.labels "app.kubernetes.io/component") "prometheus") - (ne (index .metadata.labels "app.kubernetes.io/instance") (include "supersonic.name" $root))}} - {{- .metadata.name -}} - {{- break }} - {{- end }} -{{- end }} +{{- define "supersonic.prometheusPort" -}} +{{- if .Values.prometheus.external.enabled -}} + {{- .Values.prometheus.external.port -}} +{{- else if .Values.prometheus.enabled -}} + {{- if and .Values.prometheus.server.ingress.enabled .Values.prometheus.server.ingress.tls -}} + {{- printf "443" -}} + {{- else if .Values.prometheus.server.ingress.enabled -}} + {{- printf "80" -}} + {{- else -}} + {{- .Values.prometheus.server.service.servicePort | default "9090" -}} + {{- end -}} +{{- else -}} + {{- include "supersonic.common.getExistingServicePort" (dict "serviceType" "prometheus" "root" .) -}} +{{- end -}} {{- end -}} {{/* -Get Prometheus URL (handles external, ingress, existing, and new instances) +Get full Prometheus URL */}} {{- define "supersonic.prometheusUrl" -}} -{{- if .Values.prometheus.external -}} - {{- if and .Values.prometheus.url .Values.prometheus.scheme -}} -{{ .Values.prometheus.scheme }}://{{ .Values.prometheus.url }} - {{- else -}} -http://{{ include "supersonic.prometheusName" . 
}}.{{ .Release.Namespace }}.svc.cluster.local:9090 - {{- end -}} -{{- else if and .Values.prometheus.ingress.enabled .Values.prometheus.ingress.hostName -}} -https://{{ .Values.prometheus.ingress.hostName }} -{{- else -}} - {{- $foundIngress := false -}} - {{- if (lookup "networking.k8s.io/v1" "Ingress" .Release.Namespace "") -}} - {{- $root := . -}} - {{- range (lookup "networking.k8s.io/v1" "Ingress" .Release.Namespace "").items -}} - {{- if and (eq (index .metadata.labels "app.kubernetes.io/name") "supersonic") - (eq (index .metadata.labels "app.kubernetes.io/component") "prometheus") - (ne (index .metadata.labels "app.kubernetes.io/instance") (include "supersonic.name" $root))}} - {{- range .spec.rules -}} - {{- if .host -}} - {{- $foundIngress = true -}} -https://{{ .host }} - {{- break -}} - {{- end -}} - {{- end -}} - {{- break -}} - {{- end -}} - {{- end -}} - {{- end -}} - {{- if not $foundIngress -}} - {{- if (eq (include "supersonic.prometheusExists" .) "true") -}} -http://{{ include "supersonic.existingPrometheusName" . }}.{{ .Release.Namespace }}.svc.cluster.local:9090 - {{- else -}} -http://{{ include "supersonic.prometheusName" . }}.{{ .Release.Namespace }}.svc.cluster.local:9090 - {{- end -}} - {{- end -}} +{{- include "supersonic.common.getServiceUrl" (dict "scheme" (include "supersonic.prometheusScheme" .) "host" (include "supersonic.prometheusHost" .) "port" (include "supersonic.prometheusPort" .)) -}} {{- end -}} + +{{/* +Check if Prometheus exists in the namespace +*/}} +{{- define "supersonic.prometheusExists" -}} +{{- include "supersonic.common.serviceExists" (dict "serviceName" "prometheus" "root" .) -}} +{{- end -}} + +{{/* +Validate that there is no existing Prometheus instance when enabling a new one +*/}} +{{- define "supersonic.validatePrometheus" -}} +{{- include "supersonic.common.validateNoExistingService" (dict "serviceType" "prometheus" "values" .Values "root" .) 
-}} {{- end -}} {{/* Validate RBAC permissions for Prometheus */}} {{- define "supersonic.validateRBACPermissions" -}} -{{- if not .Values.prometheus.external -}} +{{- /* Only checked when this chart deploys its own Prometheus; prometheus.external is a map, so its .enabled flag is what must be tested */ -}} {{- if and .Values.prometheus.enabled (not .Values.prometheus.external.enabled) (not (include "supersonic.prometheusExists" .)) -}} {{- $canReadRoles := false -}} {{- if (lookup "rbac.authorization.k8s.io/v1" "Role" .Release.Namespace "") -}} {{- $canReadRoles = true -}} {{- end -}} {{- if not $canReadRoles -}} - {{- fail "\nError: Failed to install Prometheus due to lack of permissions to get 'roles' in API group 'rbac.authorization.k8s.io'.\nEither:\n1. Set prometheus.external=true in value.yaml and provide an external Prometheus URL, or\n2. Request necessary RBAC permissions from your cluster administrator." -}} + {{- fail "\nError: Failed to install Prometheus due to lack of permissions to get 'roles' in API group 'rbac.authorization.k8s.io'.\nEither:\n1. Set prometheus.external.enabled=true in values.yaml and provide an external Prometheus URL, or\n2. Use an existing Prometheus instance in the namespace, or\n3. Request necessary RBAC permissions from your cluster administrator." -}} + {{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Validate Prometheus configuration values +*/}} +{{- define "supersonic.validatePrometheusValues" -}} +{{- $releaseName := include "supersonic.name" . -}} + +{{- if .Values.prometheus.enabled -}} + {{- /* Validate cluster role name */ -}} + {{- if .Values.prometheus.server.useExistingClusterRoleName -}} + {{- $expectedRole := printf "%s-prometheus-role" $releaseName -}} + {{- if ne .Values.prometheus.server.useExistingClusterRoleName $expectedRole -}} + {{- fail (printf "Mismatched configuration.
For internal consistency of SuperSONIC components, please set the following parameter:\nprometheus.server.useExistingClusterRoleName: %s" $expectedRole) -}} + {{- end -}} + {{- end -}} + + {{- /* Validate service account name */ -}} + {{- if .Values.prometheus.serviceAccounts.server.name -}} + {{- $expectedSA := printf "%s-prometheus-sa" $releaseName -}} + {{- if ne .Values.prometheus.serviceAccounts.server.name $expectedSA -}} + {{- fail (printf "Mismatched configuration. For internal consistency of SuperSONIC components, please set the following parameter:\nprometheus.serviceAccounts.server.name: %s" $expectedSA) -}} + {{- end -}} + {{- end -}} +{{- end -}} + +{{- /* Validate the Grafana datasource URL against the bundled Prometheus server; skipped when Prometheus is external or not deployed by this chart, and when no datasources are configured */ -}} +{{- if and .Values.grafana.enabled .Values.grafana.datasources .Values.prometheus.enabled (not .Values.prometheus.external.enabled) -}} + {{- range (index .Values.grafana.datasources "datasources.yaml").datasources -}} + {{- if and (eq .type "prometheus") .url -}} + {{- $expectedURL := printf "http://%s-prometheus-server:9090" $releaseName -}} + {{- if ne .url $expectedURL -}} + {{- fail (printf "Mismatched configuration. For internal consistency of SuperSONIC components, please set the following parameter:\ngrafana.datasources.yaml.datasources[].url: %s" $expectedURL) -}} + {{- end -}} + {{- end -}} {{- end -}} {{- end -}} -{{- end -}} \ No newline at end of file +{{- end -}} + +{{/* +Validate Prometheus address consistency +*/}} +{{- define "supersonic.validatePrometheusAddressConsistency" -}} +{{- include "supersonic.common.validateAddressConsistency" (dict "serviceType" "prometheus" "values" .Values "root" .) -}} +{{- end -}} + +{{/* +Get full Prometheus URL for display (without standard ports) +*/}} +{{- define "supersonic.prometheusDisplayUrl" -}} +{{- include "supersonic.common.getServiceDisplayUrl" (dict "scheme" (include "supersonic.prometheusScheme" .) 
"host" (include "supersonic.prometheusHost" .)) -}} +{{- end -}} diff --git a/helm/supersonic/templates/_scaling-metric.tpl b/helm/supersonic/templates/_scaling-metric.tpl index e20ae916..da9e054d 100644 --- a/helm/supersonic/templates/_scaling-metric.tpl +++ b/helm/supersonic/templates/_scaling-metric.tpl @@ -2,8 +2,8 @@ Get default scaling metric */}} {{- define "supersonic.defaultMetric" -}} -{{- if not ( eq .Values.prometheus.serverLoadMetric "" ) }} - {{- printf "%s" .Values.prometheus.serverLoadMetric -}} +{{- if not ( eq .Values.serverLoadMetric "" ) }} + {{- printf "%s" .Values.serverLoadMetric -}} {{- else }} sum by (release) ( rate(nv_inference_queue_duration_us{release=~"{{ include "supersonic.name" . }}"}[15s]) @@ -18,6 +18,6 @@ sum by (release) ( {{/* Get server load threshold (defaults to 100 if not set) */}} -{{- define "supersonic.serverLoadThreshold" -}} -{{- default 100 .Values.prometheus.serverLoadThreshold -}} +{{- define "supersonic.defaultThreshold" -}} +{{- default 100 .Values.serverLoadThreshold -}} {{- end -}} \ No newline at end of file diff --git a/helm/supersonic/templates/grafana-dashboard.yaml b/helm/supersonic/templates/default-dashboard.yaml similarity index 86% rename from helm/supersonic/templates/grafana-dashboard.yaml rename to helm/supersonic/templates/default-dashboard.yaml index 1d86d9ac..6ea5d2a1 100644 --- a/helm/supersonic/templates/grafana-dashboard.yaml +++ b/helm/supersonic/templates/default-dashboard.yaml @@ -2,10 +2,9 @@ apiVersion: v1 kind: ConfigMap metadata: - name: {{ include "supersonic.grafanaName" . }}-dashboard + name: {{ .Release.Name }}-grafana-default-dashboard labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . 
}} + grafana_dashboard: "1" app.kubernetes.io/component: grafana data: default.json: |- @@ -25,14 +24,12 @@ data: {{- /* Replace template variables with actual values */ -}} {{- $dashboard := $dashboard | replace "%RELEASE_NAME%" .Release.Name }} + {{- $dashboard := $dashboard | replace "%CHART_NAME%" .Chart.Name }} {{- $dashboard := $dashboard | replace "%CHART_VERSION%" .Chart.Version }} + {{- $dashboard := $dashboard | replace "%NAMESPACE%" .Release.Namespace }} {{- $dashboard := $dashboard | replace "%SERVER_LOAD_METRIC%" $metric }} {{- $dashboard := $dashboard | replace "%SERVER_LOAD_THRESHOLD%" $threshold }} {{- $dashboard := $dashboard | replace "%PROMETHEUS_URL_FULL%" $prometheus_url }} {{- $dashboard := $dashboard | replace "%HEADER%" $header }} - {{- $dashboard := $dashboard | replace "%CHART_NAME%" .Chart.Name }} - {{- $dashboard := $dashboard | replace "%NAMESPACE%" .Release.Namespace }} {{ $dashboard | nindent 4 }} - triton.json: |- - {{- $.Files.Get "dashboards/triton.json" | nindent 4 }} -{{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/supersonic/templates/envoy-configmaps.yaml b/helm/supersonic/templates/envoy-configmaps.yaml index 20e9e85d..65ac7493 100644 --- a/helm/supersonic/templates/envoy-configmaps.yaml +++ b/helm/supersonic/templates/envoy-configmaps.yaml @@ -97,14 +97,9 @@ static_resources: - endpoint: address: socket_address: - {{- if $.prometheus.external }} - address: {{ $.prometheus.url }} - port_value: {{ $.prometheus.port }} - {{- else }} - address: {{ printf "%s-prometheus.%s.svc.cluster.local" (include "supersonic.name" $.root) $.root.Release.Namespace }} - port_value: 9090 - {{- end }} - {{- if and $.prometheus.external (eq $.prometheus.scheme "https") }} + address: {{ include "supersonic.prometheusHost" $.root }} + port_value: {{ include "supersonic.prometheusPort" $.root }} + {{- if eq (include "supersonic.prometheusScheme" $.root) "https" }} transport_socket: name: envoy.transport_sockets.tls typed_config: 
@@ -125,8 +120,8 @@ static_resources: - endpoint: address: socket_address: - address: {{- .url }} - port_value: {{- .port }} + address: {{ .url }} + port_value: {{ .port }} transport_socket: name: envoy.transport_sockets.tls typed_config: @@ -212,17 +207,10 @@ data: {{- /* Read and process the Lua configuration file */}} {{- $luaConfig := $.Files.Get .Values.envoy.rate_limiter.prometheus_based.luaConfig | nindent 4 }} {{- $luaConfig = $luaConfig | replace "SERVER_LOAD_METRIC" (include "supersonic.defaultMetric" . | quote) }} - {{- $luaConfig = $luaConfig | replace "SERVER_LOAD_THRESHOLD" (quote .Values.prometheus.serverLoadThreshold) }} - {{- if .Values.prometheus.external }} - {{- $luaConfig = $luaConfig | replace "PROMETHEUS_URL" .Values.prometheus.url }} - {{- $luaConfig = $luaConfig | replace "PROMETHEUS_SCHEME" .Values.prometheus.scheme }} - {{- else if (eq (include "supersonic.prometheusExists" .) "true") }} - {{- $luaConfig = $luaConfig | replace "PROMETHEUS_URL" (printf "%s.%s.svc.cluster.local" (include "supersonic.existingPrometheusName" .) .Release.Namespace) }} - {{- $luaConfig = $luaConfig | replace "PROMETHEUS_SCHEME" "http" }} - {{- else }} - {{- $luaConfig = $luaConfig | replace "PROMETHEUS_URL" (printf "%s.%s.svc.cluster.local" (include "supersonic.prometheusName" .) .Release.Namespace) }} - {{- $luaConfig = $luaConfig | replace "PROMETHEUS_SCHEME" "http" }} - {{- end }} + {{- $luaConfig = $luaConfig | replace "SERVER_LOAD_THRESHOLD" (quote .Values.serverLoadThreshold) }} + {{- $luaConfig = $luaConfig | replace "PROMETHEUS_SCHEME" (include "supersonic.prometheusScheme" .) }} + {{- $luaConfig = $luaConfig | replace "PROMETHEUS_HOST" (include "supersonic.prometheusHost" .) }} + {{- $luaConfig = $luaConfig | replace "PROMETHEUS_PORT" (include "supersonic.prometheusPort" .) 
}} {{ $luaConfig | indent 4 }} --- diff --git a/helm/supersonic/templates/envoy-ingress.yaml b/helm/supersonic/templates/envoy-ingress.yaml new file mode 100644 index 00000000..b15834ee --- /dev/null +++ b/helm/supersonic/templates/envoy-ingress.yaml @@ -0,0 +1,50 @@ +{{/* Validate that the ingress host is not already in use in the current namespace */}} +{{- if and .Values.envoy.ingress.enabled .Values.envoy.ingress.hostName -}} + {{- $hostName := .Values.envoy.ingress.hostName -}} + {{- $namespace := .Release.Namespace -}} + {{- $currentName := (include "supersonic.name" .) -}} + {{- $existingIngresses := (lookup "networking.k8s.io/v1" "Ingress" $namespace "").items -}} + {{- range $ingress := $existingIngresses -}} + {{- if not (hasPrefix (printf "%s-" $currentName) $ingress.metadata.name) -}} + {{- range .spec.rules -}} + {{- if eq .host $hostName -}} + {{- fail (printf "Error: Ingress host %q is already in use by ingress %q in namespace %q" $hostName $ingress.metadata.name $namespace) -}} + {{- end -}} + {{- end -}} + {{- end -}} + {{- end -}} +{{- end }} + +{{ if .Values.envoy.ingress.enabled | default false }} + +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "supersonic.name" . }}-ingress-grpc + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: {{ .Chart.Name }} + app.kubernetes.io/instance: {{ include "supersonic.name" . }} + annotations: + {{- if .Values.envoy.ingress.annotations }} +{{ toYaml .Values.envoy.ingress.annotations | nindent 4 }} + {{- end }} + +spec: + ingressClassName: {{ .Values.envoy.ingress.ingressClassName }} + tls: + - hosts: + - {{ .Values.envoy.ingress.hostName }} + rules: + - host: {{ .Values.envoy.ingress.hostName }} + http: + paths: + - backend: + service: + name: {{ include "supersonic.name" . 
}} + port: + number: 8001 + path: / + pathType: ImplementationSpecific + +{{ end }} \ No newline at end of file diff --git a/helm/supersonic/templates/grafana-config.yaml b/helm/supersonic/templates/grafana-config.yaml deleted file mode 100644 index 65dfb5c7..00000000 --- a/helm/supersonic/templates/grafana-config.yaml +++ /dev/null @@ -1,48 +0,0 @@ -{{- if and .Values.grafana.enabled (ne (include "supersonic.grafanaExists" .) "true") }} -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "supersonic.grafanaName" . }}-datasources - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . }} - app.kubernetes.io/component: grafana -data: - datasources.yaml: |- - apiVersion: 1 - datasources: - - name: Prometheus - type: prometheus - uid: prometheus - access: proxy - url: {{ if and .Values.prometheus.external .Values.prometheus.url -}} - {{- .Values.prometheus.scheme }}://{{ .Values.prometheus.url -}} - {{- else -}} - http://{{ include "supersonic.prometheusName" . }}:9090 - {{- end }} - isDefault: true ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "supersonic.grafanaName" . }}-dashboards-config - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . 
}} - app.kubernetes.io/component: grafana -data: - dashboards.yaml: |- - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: '' - type: file - disableDeletion: false - editable: true - updateIntervalSeconds: 10 - allowUiUpdates: true - options: - path: /var/lib/grafana/dashboards - foldersFromFilesStructure: false -{{- end }} \ No newline at end of file diff --git a/helm/supersonic/templates/grafana-deployment.yaml b/helm/supersonic/templates/grafana-deployment.yaml deleted file mode 100644 index b468efe4..00000000 --- a/helm/supersonic/templates/grafana-deployment.yaml +++ /dev/null @@ -1,72 +0,0 @@ -{{- if and .Values.grafana.enabled (ne (include "supersonic.grafanaExists" .) "true") }} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "supersonic.grafanaName" . }} - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . }} - app.kubernetes.io/component: grafana -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . }} - app.kubernetes.io/component: grafana - template: - metadata: - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . 
}} - app.kubernetes.io/component: grafana - spec: - containers: - - name: grafana - image: "grafana/grafana:10.2.3" - imagePullPolicy: IfNotPresent - ports: - - name: http - containerPort: 3000 - protocol: TCP - env: - - name: GF_AUTH_ANONYMOUS_ENABLED - value: "true" - - name: GF_AUTH_ANONYMOUS_ORG_ROLE - value: "Admin" - - name: GF_AUTH_DISABLE_LOGIN_FORM - value: "true" - - name: GF_AUTH_BASIC_ENABLED - value: "false" - - name: GF_PATHS_PROVISIONING - value: /etc/grafana/provisioning - - name: GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH - value: "/var/lib/grafana/dashboards/default.json" - volumeMounts: - - name: config - mountPath: /etc/grafana/provisioning/datasources - readOnly: true - - name: dashboards-config - mountPath: /etc/grafana/provisioning/dashboards - readOnly: true - - name: dashboards - mountPath: /var/lib/grafana/dashboards - readOnly: true - resources: - limits: - cpu: 1 - memory: 1Gi - requests: - cpu: 100m - memory: 128Mi - volumes: - - name: config - configMap: - name: {{ include "supersonic.grafanaName" . }}-datasources - - name: dashboards-config - configMap: - name: {{ include "supersonic.grafanaName" . }}-dashboards-config - - name: dashboards - configMap: - name: {{ include "supersonic.grafanaName" . }}-dashboard -{{- end }} \ No newline at end of file diff --git a/helm/supersonic/templates/grafana-ingress.yaml b/helm/supersonic/templates/grafana-ingress.yaml deleted file mode 100644 index c361a5ef..00000000 --- a/helm/supersonic/templates/grafana-ingress.yaml +++ /dev/null @@ -1,31 +0,0 @@ -{{- if and .Values.grafana.enabled (ne (include "supersonic.grafanaExists" .) "true") .Values.grafana.ingress.enabled }} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: {{ include "supersonic.grafanaName" . }} - namespace: {{ .Release.Namespace }} - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . 
}} - app.kubernetes.io/component: grafana - annotations: - {{- if .Values.grafana.ingress.annotations }} -{{ toYaml .Values.grafana.ingress.annotations | nindent 4 }} - {{- end }} -spec: - ingressClassName: {{ .Values.grafana.ingress.ingressClassName }} - tls: - - hosts: - - {{ .Values.grafana.ingress.hostName }} - rules: - - host: {{ .Values.grafana.ingress.hostName }} - http: - paths: - - backend: - service: - name: {{ include "supersonic.grafanaName" . }} - port: - number: 80 - path: / - pathType: ImplementationSpecific -{{- end }} \ No newline at end of file diff --git a/helm/supersonic/templates/grafana-service.yaml b/helm/supersonic/templates/grafana-service.yaml deleted file mode 100644 index cf1fca64..00000000 --- a/helm/supersonic/templates/grafana-service.yaml +++ /dev/null @@ -1,21 +0,0 @@ -{{- if and .Values.grafana.enabled (ne (include "supersonic.grafanaExists" .) "true") }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "supersonic.grafanaName" . }} - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . }} - app.kubernetes.io/component: grafana -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: http - protocol: TCP - name: http - selector: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . }} - app.kubernetes.io/component: grafana -{{- end }} \ No newline at end of file diff --git a/helm/supersonic/templates/ingress.yaml b/helm/supersonic/templates/ingress.yaml deleted file mode 100644 index b5cc7010..00000000 --- a/helm/supersonic/templates/ingress.yaml +++ /dev/null @@ -1,33 +0,0 @@ -{{ if .Values.ingress.enabled | default false }} - -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: {{ include "supersonic.name" . }}-ingress-grpc - namespace: {{ .Release.Namespace }} - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . 
}} - annotations: - {{- if .Values.ingress.annotations }} -{{ toYaml .Values.ingress.annotations | nindent 4 }} - {{- end }} - -spec: - ingressClassName: {{ .Values.ingress.ingressClassName }} - tls: - - hosts: - - {{ .Values.ingress.hostName }} - rules: - - host: {{ .Values.ingress.hostName }} - http: - paths: - - backend: - service: - name: {{ include "supersonic.name" . }} - port: - number: 8001 - path: / - pathType: ImplementationSpecific - -{{ end }} \ No newline at end of file diff --git a/helm/supersonic/templates/keda-so.yaml b/helm/supersonic/templates/keda-so.yaml index ec84e866..8207f592 100644 --- a/helm/supersonic/templates/keda-so.yaml +++ b/helm/supersonic/templates/keda-so.yaml @@ -20,21 +20,21 @@ spec: {{- if (eq .Values.autoscaler.zeroIdleReplicas true) }} idleReplicaCount: 0 {{- end }} - minReplicaCount: {{ default 1 .Values.autoscaler.minReplicas }} - maxReplicaCount: {{ default 14 .Values.autoscaler.maxReplicas }} + minReplicaCount: {{ default 1 (coalesce .Values.autoscaler.minReplicaCount .Values.autoscaler.minReplicas) }}{{/* new key first, then the legacy key read by the previous template, so existing values files keep working */}} + maxReplicaCount: {{ default 14 (coalesce .Values.autoscaler.maxReplicaCount .Values.autoscaler.maxReplicas) }} advanced: horizontalPodAutoscalerConfig: behavior: scaleDown: - stabilizationWindowSeconds: {{ default 120 $scaleDown.window }} + stabilizationWindowSeconds: {{ default 120 (coalesce $scaleDown.stabilizationWindowSeconds $scaleDown.window) }} policies: - - periodSeconds: {{ default 30 $scaleDown.period }} + - periodSeconds: {{ default 30 (coalesce $scaleDown.periodSeconds $scaleDown.period) }} type: Pods value: {{ default 1 $scaleDown.stepsize }} scaleUp: - stabilizationWindowSeconds: {{ default 120 $scaleUp.window }} + stabilizationWindowSeconds: {{ default 120 (coalesce $scaleUp.stabilizationWindowSeconds $scaleUp.window) }} policies: - - periodSeconds: {{ default 30 $scaleUp.period }} + - periodSeconds: {{ default 30 (coalesce $scaleUp.periodSeconds $scaleUp.period) }} type: Pods value: {{ default 1 $scaleUp.stepsize }} @@ -44,7 +44,7 @@ spec: metadata: serverAddress: {{ include "supersonic.prometheusUrl" . 
}} metricName: autoscaler-metric - threshold: {{ .Values.prometheus.serverLoadThreshold | quote }} + threshold: {{ .Values.serverLoadThreshold | quote }} query: |- {{ include "supersonic.defaultMetric" . | nindent 8 }} --- diff --git a/helm/supersonic/templates/prometheus-configmap.yaml b/helm/supersonic/templates/prometheus-configmap.yaml index d3f0320b..703534aa 100644 --- a/helm/supersonic/templates/prometheus-configmap.yaml +++ b/helm/supersonic/templates/prometheus-configmap.yaml @@ -1,4 +1,4 @@ -{{- if and (not .Values.prometheus.external) (ne (include "supersonic.prometheusExists" .) "true") }} +{{- if and .Values.prometheus.enabled (not .Values.prometheus.external.enabled) }} apiVersion: v1 kind: ConfigMap metadata: @@ -10,45 +10,53 @@ metadata: data: prometheus.yml: | global: - scrape_interval: 5s - evaluation_interval: 5s + scrape_interval: {{ .Values.prometheus.server.global.scrape_interval }} + evaluation_interval: {{ .Values.prometheus.server.global.evaluation_interval }} scrape_configs: - - job_name: 'monitoring/{{ include "supersonic.name" . }}/0' - honor_timestamps: true - scheme: http - follow_redirects: true - enable_http2: true + - job_name: "{{ include "supersonic.tritonName" . 
}}" kubernetes_sd_configs: - - role: endpoints + - role: pod namespaces: names: - "{{ .Release.Namespace }}" + metrics_path: /metrics relabel_configs: - # Filter by app name only - - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] - regex: {{ .Chart.Name }} + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component] + regex: "triton" action: keep - # Set metrics path based on component - - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_component] - action: replace - target_label: __metrics_path__ - regex: envoy - replacement: /stats/prometheus - - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_component] - action: replace - target_label: __metrics_path__ - regex: triton - replacement: /metrics - # Add standard labels + - source_labels: [__meta_kubernetes_pod_container_port_number] + action: keep + regex: "8002" + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] + target_label: release + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] + target_label: app - source_labels: [__meta_kubernetes_namespace] target_label: namespace + + - job_name: "{{ include "supersonic.envoyName" . 
}}" + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - "{{ .Release.Namespace }}" + metrics_path: /stats/prometheus + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component] + regex: "envoy" + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_number] + regex: "9901" + action: keep - source_labels: [__meta_kubernetes_pod_name] target_label: pod - - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_component] - target_label: component - - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_instance] + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] target_label: release - - target_label: app - replacement: {{ .Chart.Name }} + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] + target_label: app + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace {{- end }} diff --git a/helm/supersonic/templates/prometheus-deployment.yaml b/helm/supersonic/templates/prometheus-deployment.yaml deleted file mode 100644 index a54f3f03..00000000 --- a/helm/supersonic/templates/prometheus-deployment.yaml +++ /dev/null @@ -1,56 +0,0 @@ -{{- include "supersonic.validateRBACPermissions" . -}} -{{- if and (not .Values.prometheus.external) (ne (include "supersonic.prometheusExists" .) "true") }} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "supersonic.prometheusName" . }} - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . }} - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . }} - app.kubernetes.io/component: prometheus - template: - metadata: - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . 
}} - app.kubernetes.io/component: prometheus - spec: - serviceAccountName: {{ include "supersonic.prometheusName" . }}-sa - containers: - - name: prometheus - image: prom/prometheus:v2.49.1 - args: - - "--config.file=/etc/prometheus/prometheus.yml" - - "--storage.tsdb.path=/prometheus" - - "--web.console.libraries=/usr/share/prometheus/console_libraries" - - "--web.console.templates=/usr/share/prometheus/consoles" - ports: - - containerPort: 9090 - name: http - volumeMounts: - - name: prometheus-config - mountPath: /etc/prometheus - - name: prometheus-storage - mountPath: /prometheus - resources: - requests: - cpu: 500m - memory: 512Mi - limits: - cpu: 1 - memory: 1Gi - volumes: - - name: prometheus-config - configMap: - name: {{ include "supersonic.prometheusName" . }}-config - - name: prometheus-storage - emptyDir: {} - restartPolicy: Always -{{- end }} diff --git a/helm/supersonic/templates/prometheus-ingress.yaml b/helm/supersonic/templates/prometheus-ingress.yaml deleted file mode 100644 index ad62e281..00000000 --- a/helm/supersonic/templates/prometheus-ingress.yaml +++ /dev/null @@ -1,31 +0,0 @@ -{{- if and (not .Values.prometheus.external) (ne (include "supersonic.prometheusExists" .) "true") .Values.prometheus.ingress.enabled }} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: {{ include "supersonic.prometheusName" . }} - namespace: {{ .Release.Namespace }} - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . 
}} - app.kubernetes.io/component: prometheus - annotations: - {{- if .Values.prometheus.ingress.annotations }} -{{ toYaml .Values.prometheus.ingress.annotations | nindent 4 }} - {{- end }} -spec: - ingressClassName: {{ .Values.prometheus.ingress.ingressClassName }} - tls: - - hosts: - - {{ .Values.prometheus.ingress.hostName }} - rules: - - host: {{ .Values.prometheus.ingress.hostName }} - http: - paths: - - backend: - service: - name: {{ include "supersonic.prometheusName" . }} - port: - number: 9090 - path: / - pathType: ImplementationSpecific -{{- end }} \ No newline at end of file diff --git a/helm/supersonic/templates/prometheus-rbac.yaml b/helm/supersonic/templates/prometheus-rbac.yaml index 137a2619..05b8da06 100644 --- a/helm/supersonic/templates/prometheus-rbac.yaml +++ b/helm/supersonic/templates/prometheus-rbac.yaml @@ -1,4 +1,4 @@ -{{- if and (not .Values.prometheus.external) (ne (include "supersonic.prometheusExists" .) "true") }} +{{- if and .Values.prometheus.enabled (not .Values.prometheus.external.enabled) }} apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: diff --git a/helm/supersonic/templates/prometheus-service.yaml b/helm/supersonic/templates/prometheus-service.yaml deleted file mode 100644 index a23fc3e3..00000000 --- a/helm/supersonic/templates/prometheus-service.yaml +++ /dev/null @@ -1,21 +0,0 @@ -{{- if and (not .Values.prometheus.external) (ne (include "supersonic.prometheusExists" .) "true") }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "supersonic.prometheusName" . }} - labels: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . 
}} - app.kubernetes.io/component: prometheus -spec: - type: {{ if not .Values.prometheus.ingress.enabled }}LoadBalancer{{ else }}ClusterIP{{ end }} - ports: - - port: 9090 - targetPort: http - protocol: TCP - name: http - selector: - app.kubernetes.io/name: {{ .Chart.Name }} - app.kubernetes.io/instance: {{ include "supersonic.name" . }} - app.kubernetes.io/component: prometheus -{{- end }} diff --git a/helm/supersonic/values.schema.json b/helm/supersonic/values.schema.json index a0a3b422..987abac3 100644 --- a/helm/supersonic/values.schema.json +++ b/helm/supersonic/values.schema.json @@ -5,6 +5,12 @@ "nameOverride": { "type": "string" }, + "serverLoadMetric": { + "type": "string" + }, + "serverLoadThreshold": { + "type": "integer" + }, "triton": { "type": "object", "properties": { @@ -232,6 +238,29 @@ "type" ] }, + "ingress": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "hostName": { + "type": "string" + }, + "ingressClassName": { + "type": "string" + }, + "annotations": { + "type": "object" + } + }, + "required": [ + "annotations", + "enabled", + "hostName", + "ingressClassName" + ] + }, "grpc_route_timeout": { "type": "string" }, @@ -323,6 +352,7 @@ "enabled", "grpc_route_timeout", "image", + "ingress", "loadBalancerPolicy", "rate_limiter", "replicas", @@ -393,95 +423,602 @@ "zeroIdleReplicas" ] }, + "nodeSelector": { + "type": "object" + }, + "tolerations": { + "type": "array" + }, "prometheus": { "type": "object", "properties": { "external": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "url": { + "type": "string" + }, + "port": { + "type": "integer" + }, + "scheme": { + "type": "string" + } + }, + "required": [ + "enabled", + "port", + "scheme", + "url" + ] + }, + "enabled": { "type": "boolean" }, - "url": { - "type": "string" + "server": { + "type": "object", + "properties": { + "useExistingClusterRoleName": { + "type": "string" + }, + "releaseNamespace": { + "type": "boolean" + }, + 
"persistentVolume": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + }, + "required": [ + "enabled" + ] + }, + "resources": { + "type": "object", + "properties": { + "requests": { + "type": "object", + "properties": { + "cpu": { + "type": "string" + }, + "memory": { + "type": "string" + } + }, + "required": [ + "cpu", + "memory" + ] + }, + "limits": { + "type": "object", + "properties": { + "cpu": { + "type": "integer" + }, + "memory": { + "type": "string" + } + }, + "required": [ + "cpu", + "memory" + ] + } + }, + "required": [ + "limits", + "requests" + ] + }, + "retention": { + "type": "string" + }, + "global": { + "type": "object", + "properties": { + "scrape_interval": { + "type": "string" + }, + "evaluation_interval": { + "type": "string" + } + }, + "required": [ + "evaluation_interval", + "scrape_interval" + ] + }, + "service": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "servicePort": { + "type": "integer" + } + }, + "required": [ + "enabled", + "servicePort" + ] + }, + "configMapOverrideName": { + "type": "string" + }, + "ingress": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "hosts": { + "type": "array" + }, + "ingressClassName": { + "type": "string" + }, + "annotations": { + "type": "object" + }, + "tls": { + "type": "array", + "items": { + "type": "object", + "properties": { + "hosts": { + "type": "array" + } + }, + "required": [ + "hosts" + ] + } + } + }, + "required": [ + "annotations", + "enabled", + "hosts", + "ingressClassName", + "tls" + ] + } + }, + "required": [ + "configMapOverrideName", + "global", + "ingress", + "persistentVolume", + "releaseNamespace", + "resources", + "retention", + "service", + "useExistingClusterRoleName" + ] }, - "port": { - "type": "integer" + "serviceAccounts": { + "type": "object", + "properties": { + "server": { + "type": "object", + "properties": { + "create": { + "type": "boolean" + }, + "name": { + "type": 
"string" + } + }, + "required": [ + "create", + "name" + ] + } + }, + "required": [ + "server" + ] }, - "scheme": { - "type": "string" + "rbac": { + "type": "object", + "properties": { + "create": { + "type": "boolean" + } + }, + "required": [ + "create" + ] }, - "serverLoadMetric": { - "type": "string" + "alertmanager": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + }, + "required": [ + "enabled" + ] }, - "serverLoadThreshold": { - "type": "integer" + "pushgateway": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + }, + "required": [ + "enabled" + ] }, - "ingress": { + "kube-state-metrics": { "type": "object", "properties": { "enabled": { "type": "boolean" - }, - "hostName": { - "type": "string" - }, - "ingressClassName": { - "type": "string" - }, - "annotations": { - "type": "object" } }, "required": [ - "annotations", - "enabled", - "hostName", - "ingressClassName" + "enabled" + ] + }, + "prometheus-node-exporter": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + }, + "required": [ + "enabled" + ] + }, + "prometheus-pushgateway": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + }, + "required": [ + "enabled" + ] + }, + "configmapReload": { + "type": "object", + "properties": { + "prometheus": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + }, + "required": [ + "enabled" + ] + } + }, + "required": [ + "prometheus" ] } }, "required": [ + "alertmanager", + "configmapReload", + "enabled", "external", - "ingress", - "port", - "scheme", - "serverLoadMetric", - "serverLoadThreshold", - "url" + "kube-state-metrics", + "prometheus-node-exporter", + "prometheus-pushgateway", + "pushgateway", + "rbac", + "server", + "serviceAccounts" ] }, - "ingress": { + "grafana": { "type": "object", "properties": { "enabled": { "type": "boolean" }, - "hostName": { + "adminUser": { "type": "string" }, - "ingressClassName": { + 
"adminPassword": { "type": "string" }, - "annotations": { - "type": "object" - } - }, - "required": [ - "annotations", - "enabled", - "hostName", - "ingressClassName" - ] - }, - "nodeSelector": { - "type": "object" - }, - "tolerations": { - "type": "array" - }, - "grafana": { - "type": "object", - "properties": { - "enabled": { - "type": "boolean" + "persistence": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + }, + "required": [ + "enabled" + ] + }, + "rbac": { + "type": "object", + "properties": { + "create": { + "type": "boolean" + } + }, + "required": [ + "create" + ] + }, + "serviceAccount": { + "type": "object", + "properties": { + "create": { + "type": "boolean" + } + }, + "required": [ + "create" + ] + }, + "datasources": { + "type": "object", + "properties": { + "datasources.yaml": { + "type": "object", + "properties": { + "apiVersion": { + "type": "integer" + }, + "datasources": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "type": { + "type": "string" + }, + "access": { + "type": "string" + }, + "isDefault": { + "type": "boolean" + }, + "url": { + "type": "string" + }, + "jsonData": { + "type": "object", + "properties": { + "timeInterval": { + "type": "string" + }, + "tlsSkipVerify": { + "type": "boolean" + } + }, + "required": [ + "timeInterval", + "tlsSkipVerify" + ] + } + }, + "required": [ + "access", + "isDefault", + "jsonData", + "name", + "type", + "url" + ] + } + } + }, + "required": [ + "apiVersion", + "datasources" + ] + } + }, + "required": [ + "datasources.yaml" + ] + }, + "dashboardProviders": { + "type": "object", + "properties": { + "dashboardproviders.yaml": { + "type": "object", + "properties": { + "apiVersion": { + "type": "integer" + }, + "providers": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "orgId": { + "type": "integer" + }, + "folder": { + "type": "string" + }, + 
"type": { + "type": "string" + }, + "disableDeletion": { + "type": "boolean" + }, + "editable": { + "type": "boolean" + }, + "options": { + "type": "object", + "properties": { + "path": { + "type": "string" + } + }, + "required": [ + "path" + ] + } + }, + "required": [ + "disableDeletion", + "editable", + "folder", + "name", + "options", + "orgId", + "type" + ] + } + } + }, + "required": [ + "apiVersion", + "providers" + ] + } + }, + "required": [ + "dashboardproviders.yaml" + ] + }, + "dashboardsConfigMaps": { + "type": "object", + "properties": { + "default": { + "type": "string" + } + }, + "required": [ + "default" + ] + }, + "grafana.ini": { + "type": "object", + "properties": { + "auth": { + "type": "object", + "properties": { + "disable_login_form": { + "type": "boolean" + } + }, + "required": [ + "disable_login_form" + ] + }, + "auth.anonymous": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "org_role": { + "type": "string" + } + }, + "required": [ + "enabled", + "org_role" + ] + }, + "dashboards": { + "type": "object", + "properties": { + "default_home_dashboard_path": { + "type": "string" + } + }, + "required": [ + "default_home_dashboard_path" + ] + } + }, + "required": [ + "auth", + "auth.anonymous", + "dashboards" + ] + }, + "resources": { + "type": "object", + "properties": { + "limits": { + "type": "object", + "properties": { + "cpu": { + "type": "integer" + }, + "memory": { + "type": "string" + } + }, + "required": [ + "cpu", + "memory" + ] + }, + "requests": { + "type": "object", + "properties": { + "cpu": { + "type": "string" + }, + "memory": { + "type": "string" + } + }, + "required": [ + "cpu", + "memory" + ] + } + }, + "required": [ + "limits", + "requests" + ] + }, + "service": { + "type": "object", + "properties": { + "type": { + "type": "string" + }, + "port": { + "type": "integer" + }, + "targetPort": { + "type": "integer" + } + }, + "required": [ + "port", + "targetPort", + "type" + ] }, "ingress": { 
"type": "object", @@ -489,12 +1026,21 @@ "enabled": { "type": "boolean" }, - "hostName": { + "path": { + "type": "string" + }, + "pathType": { "type": "string" }, "ingressClassName": { "type": "string" }, + "hosts": { + "type": "array" + }, + "tls": { + "type": "array" + }, "annotations": { "type": "object" } @@ -502,14 +1048,28 @@ "required": [ "annotations", "enabled", - "hostName", - "ingressClassName" + "hosts", + "ingressClassName", + "path", + "pathType", + "tls" ] } }, "required": [ + "adminPassword", + "adminUser", + "dashboardProviders", + "dashboardsConfigMaps", + "datasources", "enabled", - "ingress" + "grafana.ini", + "ingress", + "persistence", + "rbac", + "resources", + "service", + "serviceAccount" ] } }, @@ -517,10 +1077,11 @@ "autoscaler", "envoy", "grafana", - "ingress", "nameOverride", "nodeSelector", "prometheus", + "serverLoadMetric", + "serverLoadThreshold", "tolerations", "triton" ] diff --git a/helm/supersonic/values.yaml b/helm/supersonic/values.yaml index 7c56a34f..a349f33f 100644 --- a/helm/supersonic/values.yaml +++ b/helm/supersonic/values.yaml @@ -3,6 +3,13 @@ # -- Unique identifier of SuperSONIC instance (equal to release name by default) nameOverride: "" +# -- A metric used by both KEDA autoscaler and Envoy's prometheus-based rate limiter. +## Default metric (inference queue latency) is defined in templates/_helpers.tpl +serverLoadMetric: "" + +# -- Threshold for the metric +serverLoadThreshold: 100 + triton: # -- Number of Triton server instances (if autoscaling is disabled) replicas: 1 @@ -102,6 +109,13 @@ envoy: - { name: grpc, port: 8001, targetPort: 8001 } - { name: admin, port: 9901, targetPort: 9901 } + # -- Ingress configuration for Envoy + ingress: + enabled: false + hostName: "" + ingressClassName: "" + annotations: {} + # -- Timeout for gRPC route in Envoy; disabled by default (0s), preventing Envoy from closing connections too early. 
grpc_route_timeout: 0s @@ -165,57 +179,153 @@ autoscaler: period: 120 stepsize: 1 -# -- Connection to a Prometheus server is required for KEDA autoscaler and Envoy's prometheus-based rate limiter -prometheus: - # -- Whether to use external Prometheus instance (true) or deploy internal one (false) - external: true - - # -- External Prometheus server url and port number (find in documentation of a given cluster or ask admins) - # Only used when external=true - url: "" - port: 443 - - # -- Specify whether external Prometheus endpoint is exposed as http or https - # Only used when external=true - scheme: "https" +# -- Node selector for all pods (Triton and Envoy) +nodeSelector: {} - # -- A metric used by both KEDA autoscaler and Envoy's prometheus-based rate limiter. - ## Default metric (inference queue latency) is defined in templates/_helpers.tpl - serverLoadMetric: "" - - # -- Threshold for the metric - serverLoadThreshold: 100 +# -- Tolerations for all pods (Triton and Envoy) +tolerations: [] - # -- Ingress configuration for internal Prometheus web UI (only used when external=false) - ingress: +# -- Connection to a Prometheus server is required for KEDA autoscaler and Envoy's prometheus-based rate limiter +prometheus: + external: + # -- Enable external Prometheus instance enabled: false - hostName: "" - ingressClassName: "" - annotations: {} + # -- External Prometheus server url + url: "" + # -- External Prometheus server port number + port: 443 + # -- Specify whether external Prometheus endpoint is exposed as http or https + scheme: "https" -ingress: + # -- Enable or disable Prometheus subchart deployment enabled: false - hostName: "" - ingressClassName: "" - annotations: {} -# -- Node selector for all pods (Triton and Envoy) -nodeSelector: {} - -# -- Tolerations for all pods (Triton and Envoy) -tolerations: [] + # -- Prometheus Helm chart configuration (https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus) + server: + 
useExistingClusterRoleName: supersonic-prometheus-role + releaseNamespace: true + persistentVolume: + enabled: false + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 1 + memory: 1Gi + retention: 15d + global: + scrape_interval: 5s + evaluation_interval: 5s + service: + enabled: true + servicePort: 9090 + configMapOverrideName: prometheus-config + ingress: + enabled: false + hosts: [] + ingressClassName: "" + annotations: {} + tls: + - hosts: [] + + serviceAccounts: + server: + create: false + name: supersonic-prometheus-sa + + rbac: + create: false + alertmanager: + enabled: false + pushgateway: + enabled: false + kube-state-metrics: + enabled: false + prometheus-node-exporter: + enabled: false + prometheus-pushgateway: + enabled: false + configmapReload: + prometheus: + enabled: false -## Grafana Configuration +## Grafana Helm Chart Configuration +## Configuration for the official Grafana Helm chart (https://github.com/grafana/helm-charts) grafana: - # -- Enable or disable Grafana deployment enabled: false - # -- Ingress configuration for Grafana + adminUser: admin + adminPassword: admin + persistence: + enabled: false + rbac: + create: false + serviceAccount: + create: false + + # -- Grafana datasources configuration + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: http://supersonic-prometheus-server:9090 + jsonData: + timeInterval: "5s" + tlsSkipVerify: true + + # -- Grafana dashboard providers configuration + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + + # -- Grafana dashboard ConfigMaps + dashboardsConfigMaps: + default: supersonic-grafana-default-dashboard + + # -- Grafana.ini configuration + grafana.ini: + auth: + disable_login_form: true + auth.anonymous: 
+ enabled: true + org_role: Admin + dashboards: + default_home_dashboard_path: /var/lib/grafana/dashboards/default/default.json + + # -- Resource limits and requests for Grafana + resources: + limits: + cpu: 1 + memory: 1Gi + requests: + cpu: 100m + memory: 128Mi + + # -- Service configuration + service: + type: ClusterIP + port: 80 + targetPort: 3000 + + # -- Ingress configuration ingress: - # -- Enable or disable ingress for Grafana enabled: false - # -- Hostname for Grafana ingress - hostName: "" - # -- Ingress class name (e.g. nginx, haproxy) - ingressClassName: haproxy - # -- Additional annotations for Grafana ingress + path: / + pathType: ImplementationSpecific + ingressClassName: "" + hosts: [] + tls: [] annotations: {} \ No newline at end of file diff --git a/values/values-anvil-cms.yaml b/values/values-anvil-cms.yaml index 41dc6871..7508114e 100644 --- a/values/values-anvil-cms.yaml +++ b/values/values-anvil-cms.yaml @@ -33,15 +33,11 @@ triton: - NVIDIA-A100-SXM4-80GB envoy: enabled: true -# prometheus: -# url: "prometheus.nrp-nautilus.io" -# port: 443 -# scheme: https -# serverLoadThreshold: 10 + ingress: + enabled: true + hostName: sonic-cms.anvilcloud.rcac.purdue.edu + autoscaler: - enabled: False - minReplicas: 1 - maxReplicas: 5 -ingress: - enabled: true - hostName: sonic-cms.anvilcloud.rcac.purdue.edu + enabled: false + minReplicaCount: 1 + maxReplicaCount: 5 diff --git a/values/values-cms-ci.yaml b/values/values-cms-ci.yaml index 89617eb4..5f7c5977 100644 --- a/values/values-cms-ci.yaml +++ b/values/values-cms-ci.yaml @@ -18,6 +18,7 @@ triton: storageType: cvmfs-pvc mountPath: /cvmfs resetReadinessProbe: true + envoy: enabled: true service: @@ -28,13 +29,14 @@ envoy: max_tokens: 5 tokens_per_fill: 1 fill_interval: 12s -prometheus: - external: false + autoscaler: enabled: true - minReplicas: 1 - maxReplicas: 2 -ingress: - enabled: false + minReplicaCount: 1 + maxReplicaCount: 2 + +prometheus: + enabled: true + grafana: - enabled: true \ No newline at 
end of file + enabled: true diff --git a/values/values-geddes-cms.yaml b/values/values-geddes-cms.yaml index 0fdff09e..5b99ba47 100644 --- a/values/values-geddes-cms.yaml +++ b/values/values-geddes-cms.yaml @@ -1,3 +1,5 @@ +serverLoadThreshold: 100 + triton: # image: fastml/triton-torchgeo:21.02-py3-geometric # run2 image: fastml/triton-torchgeo:22.07-py3-geometric # run3 @@ -25,29 +27,20 @@ triton: enabled: true storageType: cvmfs-pvc mountPath: /cvmfs + envoy: enabled: true loadBalancerPolicy: "ROUND_ROBIN" service: type: LoadBalancer -prometheus: - external: false - serverLoadThreshold: 100 ingress: enabled: true - hostName: prometheus-cms.geddes.rcac.purdue.edu - ingressClassName: public -grafana: - enabled: true - ingress: - enabled: true - hostName: grafana-cms.geddes.rcac.purdue.edu - ingressClassName: public + hostName: sonic-cms.geddes.rcac.purdue.edu autoscaler: enabled: true - minReplicas: 1 - maxReplicas: 7 + minReplicaCount: 1 + maxReplicaCount: 7 ingress: enabled: false @@ -59,3 +52,29 @@ tolerations: operator: Equal value: cms-af effect: NoSchedule + +prometheus: + enabled: true + server: + ingress: + enabled: true + hosts: + - prometheus-cms.geddes.rcac.purdue.edu + tls: + - hosts: + - prometheus-cms.geddes.rcac.purdue.edu + ingressClassName: public + +grafana: + enabled: true + ingress: + enabled: true + hosts: + - grafana-cms.geddes.rcac.purdue.edu + tls: + - hosts: + - grafana-cms.geddes.rcac.purdue.edu + ingressClassName: public + grafana.ini: + server: + root_url: https://grafana-cms.geddes.rcac.purdue.edu diff --git a/values/values-nautilus-atlas.yaml b/values/values-nautilus-atlas.yaml index b15dbad0..078724f9 100644 --- a/values/values-nautilus-atlas.yaml +++ b/values/values-nautilus-atlas.yaml @@ -1,3 +1,5 @@ +serverLoadThreshold: 100 + triton: name: triton-atlas image: milescb/traccc-aas:v1.1 @@ -33,30 +35,64 @@ triton: enabled: true storageType: cvmfs-pvc mountPath: /cvmfs + envoy: enabled: true loadBalancerPolicy: "ROUND_ROBIN" 
-prometheus: - url: "prometheus.nrp-nautilus.io" - port: 443 - scheme: https - serverLoadThreshold: 100 + ingress: + enabled: true + hostName: atlas.nrp-nautilus.io + ingressClassName: haproxy + annotations: + haproxy-ingress.github.io/cors-enable: "true" + haproxy-ingress.github.io/backend-protocol: "h2" + haproxy-ingress.github.io/proxy-body-size: "512m" + haproxy-ingress.github.io/timeout-client: "5m" + haproxy-ingress.github.io/timeout-server: "5m" + haproxy-ingress.github.io/timeout-connect: "5m" + haproxy-ingress.github.io/timeout-http-request: "5m" + haproxy-ingress.github.io/timeout-queue: "1m" + haproxy-ingress.github.io/health-check-interval: "30s" + haproxy-ingress.github.io/health-check-rise-count: "1" + autoscaler: enabled: False - minReplicas: 0 - maxReplicas: 1 -ingress: + minReplicaCount: 0 + maxReplicaCount: 1 + +prometheus: + external: + enabled: true + url: "prometheus.nrp-nautilus.io" + port: 443 + scheme: https + +grafana: enabled: true - hostName: atlas.nrp-nautilus.io - ingressClassName: haproxy - annotations: - haproxy-ingress.github.io/cors-enable: "true" - haproxy-ingress.github.io/backend-protocol: "h2" - haproxy-ingress.github.io/proxy-body-size: "512m" - haproxy-ingress.github.io/timeout-client: "5m" - haproxy-ingress.github.io/timeout-server: "5m" - haproxy-ingress.github.io/timeout-connect: "5m" - haproxy-ingress.github.io/timeout-http-request: "5m" - haproxy-ingress.github.io/timeout-queue: "1m" - haproxy-ingress.github.io/health-check-interval: "30s" - haproxy-ingress.github.io/health-check-rise-count: "1" \ No newline at end of file + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: https://prometheus.nrp-nautilus.io + jsonData: + timeInterval: "5s" + tlsSkipVerify: true + ingress: + enabled: true + hosts: + - grafana-atlas.nrp-nautilus.io + tls: + - hosts: + - grafana-atlas.nrp-nautilus.io + ingressClassName: haproxy + annotations: + 
haproxy-ingress.github.io/cors-enable: "true" + haproxy-ingress.github.io/backend-protocol: "h2" + haproxy-ingress.github.io/proxy-body-size: "512m" + grafana.ini: + server: + root_url: https://grafana-atlas.nrp-nautilus.io diff --git a/values/values-nautilus-cms.yaml b/values/values-nautilus-cms.yaml index 76fc3a05..3e0b24ac 100644 --- a/values/values-nautilus-cms.yaml +++ b/values/values-nautilus-cms.yaml @@ -1,3 +1,5 @@ +serverLoadThreshold: 100 + triton: # image: fastml/triton-torchgeo:21.02-py3-geometric # run2 image: fastml/triton-torchgeo:22.07-py3-geometric # run3 @@ -11,53 +13,73 @@ triton: --exit-timeout-secs=60 \ --backend-config=onnxruntime,enable-global-threadpool=1 resources: - limits: { cpu: 2, memory: 4G, nvidia.com/a100: 1} - requests: { cpu: 2, memory: 4G, nvidia.com/a100: 1} + limits: { cpu: 2, memory: 4G, nvidia.com/gpu: 1} + requests: { cpu: 2, memory: 4G, nvidia.com/gpu: 1} modelRepository: enabled: true storageType: cvmfs-pvc mountPath: /cvmfs resetReadinessProbe: true + envoy: enabled: true loadBalancerPolicy: "ROUND_ROBIN" -prometheus: - external: false - url: prometheus.nrp-nautilus.io - port: 443 - scheme: https - serverLoadThreshold: 100 ingress: enabled: true - hostName: prometheus-cms.nrp-nautilus.io + hostName: sonic-cms.nrp-nautilus.io ingressClassName: haproxy + annotations: + haproxy-ingress.github.io/cors-enable: "true" + haproxy-ingress.github.io/backend-protocol: "h2" + haproxy-ingress.github.io/proxy-body-size: "512m" + haproxy-ingress.github.io/timeout-client: "5m" + haproxy-ingress.github.io/timeout-server: "5m" + haproxy-ingress.github.io/timeout-connect: "5m" + haproxy-ingress.github.io/timeout-http-request: "5m" + haproxy-ingress.github.io/timeout-queue: "1m" + haproxy-ingress.github.io/health-check-interval: "30s" + haproxy-ingress.github.io/health-check-rise-count: "1" autoscaler: enabled: true - minReplicas: 1 - maxReplicas: 5 -ingress: - enabled: true - hostName: sonic-cms.nrp-nautilus.io - ingressClassName: haproxy - 
annotations: - haproxy-ingress.github.io/cors-enable: "true" - haproxy-ingress.github.io/backend-protocol: "h2" - haproxy-ingress.github.io/proxy-body-size: "512m" - haproxy-ingress.github.io/timeout-client: "5m" - haproxy-ingress.github.io/timeout-server: "5m" - haproxy-ingress.github.io/timeout-connect: "5m" - haproxy-ingress.github.io/timeout-http-request: "5m" - haproxy-ingress.github.io/timeout-queue: "1m" - haproxy-ingress.github.io/health-check-interval: "30s" - haproxy-ingress.github.io/health-check-rise-count: "1" + minReplicaCount: 1 + maxReplicaCount: 5 # nodeSelector: # topology.kubernetes.io/zone: ucsd +prometheus: + enabled: true + server: + ingress: + enabled: true + hosts: + - prometheus-cms.nrp-nautilus.io + tls: + - hosts: + - prometheus-cms.nrp-nautilus.io + ingressClassName: haproxy + annotations: + haproxy-ingress.github.io/cors-enable: "true" + haproxy-ingress.github.io/proxy-body-size: "512m" + haproxy-ingress.github.io/timeout-http-request: "5m" + grafana: enabled: true ingress: enabled: true - hostName: grafana-cms.nrp-nautilus.io + hosts: + - grafana-cms.nrp-nautilus.io + tls: + - hosts: + - grafana-cms.nrp-nautilus.io ingressClassName: haproxy + annotations: + haproxy-ingress.github.io/cors-enable: "true" + haproxy-ingress.github.io/proxy-body-size: "512m" + haproxy-ingress.github.io/timeout-http-request: "5m" + grafana.ini: + server: + root_url: https://grafana-cms.nrp-nautilus.io + + diff --git a/values/values-nautilus-icecube.yaml b/values/values-nautilus-icecube.yaml index c7eb77ea..5a6e3193 100644 --- a/values/values-nautilus-icecube.yaml +++ b/values/values-nautilus-icecube.yaml @@ -1,3 +1,5 @@ +serverLoadThreshold: 100 + triton: image: nvcr.io/nvidia/tritonserver:24.06-py3 affinity: @@ -36,6 +38,7 @@ triton: enabled: true storageType: cvmfs-pvc mountPath: /models + envoy: enabled: true auth: @@ -45,27 +48,60 @@ envoy: audiences: [icecube] url: keycloak.icecube.wisc.edu port: 443 -prometheus: - url: "prometheus.nrp-nautilus.io" - 
port: 443 - scheme: https - serverLoadThreshold: 100 + ingress: + enabled: true + hostName: icesonic.nrp-nautilus.io + ingressClassName: haproxy + annotations: + haproxy-ingress.github.io/cors-enable: "true" + haproxy-ingress.github.io/backend-protocol: "h2" + haproxy-ingress.github.io/proxy-body-size: "512m" + haproxy-ingress.github.io/timeout-client: "5m" + haproxy-ingress.github.io/timeout-server: "5m" + haproxy-ingress.github.io/timeout-connect: "5m" + haproxy-ingress.github.io/timeout-http-request: "5m" + haproxy-ingress.github.io/timeout-queue: "1m" + haproxy-ingress.github.io/health-check-interval: "30s" + haproxy-ingress.github.io/health-check-rise-count: "1" + autoscaler: enabled: false - minReplicas: 0 - maxReplicas: 1 -ingress: + minReplicaCount: 0 + maxReplicaCount: 1 + +prometheus: + external: + enabled: true + url: "prometheus.nrp-nautilus.io" + port: 443 + scheme: https + +grafana: enabled: true - hostName: icesonic.nrp-nautilus.io - ingressClassName: haproxy - annotations: - haproxy-ingress.github.io/cors-enable: "true" - haproxy-ingress.github.io/backend-protocol: "h2" - haproxy-ingress.github.io/proxy-body-size: "512m" - haproxy-ingress.github.io/timeout-client: "5m" - haproxy-ingress.github.io/timeout-server: "5m" - haproxy-ingress.github.io/timeout-connect: "5m" - haproxy-ingress.github.io/timeout-http-request: "5m" - haproxy-ingress.github.io/timeout-queue: "1m" - haproxy-ingress.github.io/health-check-interval: "30s" - haproxy-ingress.github.io/health-check-rise-count: "1" \ No newline at end of file + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: https://prometheus.nrp-nautilus.io + jsonData: + timeInterval: "5s" + tlsSkipVerify: true + ingress: + enabled: true + hosts: + - grafana-icecube.nrp-nautilus.io + tls: + - hosts: + - grafana-icecube.nrp-nautilus.io + ingressClassName: haproxy + annotations: + haproxy-ingress.github.io/cors-enable: 
"true" + haproxy-ingress.github.io/backend-protocol: "h2" + haproxy-ingress.github.io/proxy-body-size: "512m" + grafana.ini: + server: + root_url: https://grafana-icecube.nrp-nautilus.io