diff --git a/docs/dashboard.md b/docs/dashboard.md index 878d70b0e..b898608b0 100644 --- a/docs/dashboard.md +++ b/docs/dashboard.md @@ -1,54 +1,206 @@ -## Grafana Dashboard +# Abstract -- You can load this dashboard json file [gpu-dashboard.json](./gpu-dashboard.json) +​ For the sake of simplicity, this article describes only one possible approach: using prometheus to collect monitoring metrics as a data source and grafana to present the monitoring information. -- This dashboard also includes some NVIDIA DCGM metrics: +​ Many users have reported through issues that they do not know how to install and configure the related components, and therefore cannot use the dashboard properly. The installation and configuration steps are described below. We hope they work smoothly for you! Any feedback is welcome. - [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) deploy:`kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml` +​ This article assumes that a Kubernetes cluster and HAMi have been deployed successfully. The following components are installed in the kubernetes cluster. 
The components or software versions are as follows: -- use this prometheus custom metric configure: +| components or software name | version | remark | +| --------------------------- | ------------------- | ---------------- | +| kubernetes cluster | v1.23.10 | in AMD64 servers | +| kube-prometheus stack | branch release-0.11 | | +| dcgm-exporter | tag 3.2.5-3.1.7 | | -```yaml -- job_name: 'kubernetes-vgpu-exporter' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_endpoints_name] - regex: vgpu-device-plugin-monitor +# Deploy and configure kube-prometheus stack + +## Deploy kube-prometheus stack + +**Note:** See the version compatibility matrix for kubernetes and kube-prometheus stack at https://github.com/prometheus-operator/kube-prometheus?tab=readme-ov-file#compatibility , and choose the kube-prometheus stack version that matches your kubernetes version. + +```shell +#Clone kube-prometheus code repository(using release-0.11 here) +git clone -b release-0.11 https://github.com/prometheus-operator/kube-prometheus.git +cd kube-prometheus + +#Change the type of the grafana service to NodePort by adding "type: NodePort" under the spec section +vi manifests/grafana-service.yaml +... +spec: + type: NodePort +... +#Similarly, change the prometheus and alertmanager service types to NodePort. Their configuration files are prometheus-service.yaml and alertmanager-service.yaml, respectively, in the manifests directory + +#Deploy +kubectl create -f manifests/setup/ +kubectl create -f manifests/. 
+ +#All resource objects are created under the monitoring namespace; you can check them and their status by running the following command +kubectl -n monitoring get all +``` + +```shell +#Once all resource objects under the monitoring namespace are in the expected state, you can obtain the svc information for grafana, prometheus, and alertmanager as follows +root@controller01:~/kube-prometheus# kubectl -n monitoring get svc | egrep "NAME|grafana|prometheus-k8s|alertmanager-main" +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +alertmanager-main NodePort 10.233.5.65 9093:30093/TCP,8080:30401/TCP 19h +grafana NodePort 10.233.56.112 3000:30300/TCP 19h +prometheus-k8s NodePort 10.233.38.113 9090:30090/TCP,8080:31273/TCP 19h +``` + +​ If the ip address of the controller node is 10.0.0.21, then grafana, prometheus, and alertmanager can be accessed using the following urls: http://10.0.0.21:30300 , http://10.0.0.21:30090 , and http://10.0.0.21:30093 , and the default user name and password for accessing grafana are both admin. + +## Configure grafana + +### Create Datasource ALL + +​ Go to the "Configuration" -> "Data sources" page in grafana and create a datasource named "ALL", keeping the value of HTTP.URL the same as in the default "prometheus" datasource (i.e. "http://prometheus-k8s.monitoring.svc:9090" ), then save the datasource "ALL". + +### Import dashboard + +​ Go to the "Dashboards" -> "Browse" page in grafana and import the dashboard from https://grafana.com/grafana/dashboards/22043-hami-vgpu-metrics-dashboard/ , and a dashboard named "hami-vgpu-metrics-dashboard" will be created. 22043-hami-vgpu-metrics-dashboard has been verified on grafana8.5.5 and grafana9.1.0, and it very likely also works on grafana versions later than 9.1.0. At this point some panels on this dashboard page (such as vGPUCorePercentage) have no data yet; please read the rest of this document and complete the steps in "Deploy dcgm-exporter" and "Create ServiceMonitor", after which the panel data will be displayed normally. 
+ +​ For versions earlier than grafana8.5.5, such as grafana7.5.17, please use this dashboard instead: https://grafana.com/grafana/dashboards/21833-hami-vgpu-dashboard/ + +# Deploy dcgm-exporter + +```shell +#Clone dcgm-exporter code repository(no compatibility matrix for dcgm-exporter and kubernetes was found on its official website, using "tag 3.2.5-3.1.7" here) +git clone -b 3.2.5-3.1.7 https://github.com/NVIDIA/dcgm-exporter.git +cd dcgm-exporter + +#Install dcgm-exporter under monitoring namespace with helm +helm install dcgm-exporter deployment/ -n monitoring + +#Check installation results +root@controller01:~/dcgm-exporter# helm list -n monitoring +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +dcgm-exporter monitoring 1 2024-10-02 16:32:35.691073696 +0800 CST deployed dcgm-exporter-3.1.7 3.1.7 +``` + +# Create ServiceMonitor + +```shell +#Create the file hami-device-plugin-svc-monitor.yaml +root@controller01:~# touch hami-device-plugin-svc-monitor.yaml +#The content of the file hami-device-plugin-svc-monitor.yaml +root@controller01:~# cat hami-device-plugin-svc-monitor.yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: hami-device-plugin-svc-monitor + namespace: kube-system +spec: + selector: + matchLabels: + app.kubernetes.io/component: hami-device-plugin + namespaceSelector: + matchNames: + - "kube-system" + endpoints: + - path: /metrics + port: monitorport + interval: "15s" + honorLabels: false + relabelings: + - sourceLabels: [__meta_kubernetes_endpoints_name] + regex: hami-.* replacement: $1 action: keep - - source_labels: [__meta_kubernetes_pod_node_name] + - sourceLabels: [__meta_kubernetes_pod_node_name] regex: (.*) - target_label: node_name + targetLabel: node_name replacement: ${1} action: replace - - source_labels: [__meta_kubernetes_pod_host_ip] + - sourceLabels: [__meta_kubernetes_pod_host_ip] regex: (.*) - target_label: ip + targetLabel: ip replacement: $1 action: replace -- job_name: 
'kubernetes-dcgm-exporter' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_endpoints_name] - regex: dcgm-exporter + +#Apply the file hami-device-plugin-svc-monitor.yaml +root@controller01:~# kubectl apply -f hami-device-plugin-svc-monitor.yaml +``` + +```shell +#Create the file hami-scheduler-svc-monitor.yaml +root@controller01:~# touch hami-scheduler-svc-monitor.yaml +#The content of the file hami-scheduler-svc-monitor.yaml +root@controller01:~# cat hami-scheduler-svc-monitor.yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: hami-scheduler-svc-monitor + namespace: kube-system +spec: + selector: + matchLabels: + app.kubernetes.io/component: hami-scheduler + namespaceSelector: + matchNames: + - "kube-system" + endpoints: + - path: /metrics + port: monitor + interval: "15s" + honorLabels: false + relabelings: + - sourceLabels: [__meta_kubernetes_endpoints_name] + regex: hami-.* replacement: $1 action: keep - - source_labels: [__meta_kubernetes_pod_node_name] + - sourceLabels: [__meta_kubernetes_pod_node_name] regex: (.*) - target_label: node_name + targetLabel: node_name replacement: ${1} action: replace - - source_labels: [__meta_kubernetes_pod_host_ip] + - sourceLabels: [__meta_kubernetes_pod_host_ip] regex: (.*) - target_label: ip + targetLabel: ip replacement: $1 action: replace + +#Apply the file hami-scheduler-svc-monitor.yaml +root@controller01:~# kubectl apply -f hami-scheduler-svc-monitor.yaml +``` + +```shell +#Check the servicemonitors +root@controller01:~# kubectl -n kube-system get servicemonitor +NAME AGE +hami-device-plugin-svc-monitor 28h +hami-scheduler-svc-monitor 28h ``` -- reload promethues: +# Confirm the final monitoring effect -```bash -curl -XPOST http://{promethuesServer}:{port}/-/reload +```shell +#Create the file gpu-pod.yaml +root@controller01:~# touch gpu-pod.yaml +root@controller01:~# cat gpu-pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: 
gpu-pod-01 +spec: + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2 + resources: + limits: + nvidia.com/vgpu: 2 # requesting 2 vGPUs + nvidia.com/gpumem: 3000 # Each vGPU contains 3000m device memory (Optional,Integer) + nvidia.com/gpucores: 10 # Each vGPU uses 10% of the entire GPU (Optional,Integer) + +#Apply the file gpu-pod.yaml +root@controller01:~# kubectl apply -f gpu-pod.yaml +root@controller01:~# kubectl get pods -o wide +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +gpu-pod-01 0/1 Completed 0 52s 10.233.81.70 controller01 ``` + +​ You can now see the monitoring details in the dashboard. The contents look roughly as follows: + +![image-20241003215400685](../imgs/hami-vgpu-metrics-dashboard.png) + diff --git a/docs/dashboard_cn.md b/docs/dashboard_cn.md index 20ee3ecb8..7bf0e2ac9 100644 --- a/docs/dashboard_cn.md +++ b/docs/dashboard_cn.md @@ -1,53 +1,205 @@ -## Grafana Dashboard +# 说明 -- 你可以在 grafana 中导入此 [gpu-dashboard.json](./gpu-dashboard.json) -- 此 dashboard 还包括一部分 NVIDIA DCGM 监控指标: +​ 为了阐述的简明性,本文只提供一种可行的办法,最终实现使用prometheus抓取监控指标并作为数据源、使用grafana来展示监控信息的目的。 - [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter)部署:`kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml` +​ 很多使用者通过issue反馈不知如何安装与配置相关组件导致无法正常使用相关dashboard,现将安装与配置步骤叙述如下,希望大家使用顺利!如有问题欢迎创建issue反馈,如有不足欢迎补充! 
-- 添加 prometheus 自定义的监控项: +​ 本文假设读者已经部署好Kubernetes集群、HAMi。以下涉及到的相关组件都是在kubernetes集群内安装的,相关组件或软件版本信息如下: -```yaml -- job_name: 'kubernetes-vgpu-exporter' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_endpoints_name] - regex: vgpu-device-plugin-monitor +| 组件或软件名称 | 版本 | 备注 | +| --------------------- | ----------------- | --------------------- | +| kubernetes集群 | v1.23.10 | AMD64架构服务器环境下 | +| kube-prometheus stack | 分支 release-0.11 | | +| dcgm-exporter | tag 3.2.5-3.1.7 | | + +# 部署与配置kube-prometheus stack + +## 部署kube-prometheus stack + +**注:** kubernetes与kube-prometheus stack的版本兼容矩阵请查看 https://github.com/prometheus-operator/kube-prometheus?tab=readme-ov-file#compatibility ,请根据自己的kubernetes版本选择合适版本的kube-prometheus stack + +```shell +#下载kube-prometheus代码仓库(此处使用分支 release-0.11) +git clone -b release-0.11 https://github.com/prometheus-operator/kube-prometheus.git +cd kube-prometheus + +#修改grafana的service类型为NodePort,即在spec下添加type配置项 +vi manifests/grafana-service.yaml +... +spec: + type: NodePort +... +#类似的方法修改prometheus与alertmanager service类型为NodePort,它们的配置文件分别是manifests目录下的prometheus-service.yaml与alertmanager-service.yaml + +#执行部署 +kubectl create -f manifests/setup/ +kubectl create -f manifests/. 
+ +#创建的所有资源对象都在monitoring命名空间下,使用如下命令查看资源对象的运行状态 +kubectl -n monitoring get all +``` + +```shell +#等monitoring命名空间下所有资源对象处于正常运行状态后,使用如下方式获取grafana、prometheus与alertmanager的svc信息 +root@controller01:~/kube-prometheus# kubectl -n monitoring get svc | egrep "NAME|grafana|prometheus-k8s|alertmanager-main" +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +alertmanager-main NodePort 10.233.5.65 9093:30093/TCP,8080:30401/TCP 19h +grafana NodePort 10.233.56.112 3000:30300/TCP 19h +prometheus-k8s NodePort 10.233.38.113 9090:30090/TCP,8080:31273/TCP 19h +``` + +​ 此时,假如控制节点的ip是10.0.0.21,则可以分别使用如下url访问grafana、prometheus与alertmanager:http://10.0.0.21:30300 、http://10.0.0.21:30090 、http://10.0.0.21:30093 ,其中访问grafana的默认用户名与密码都是admin。 + +## 配置grafana + +### 创建数据源ALL + +​ 访问“Configuration”->“Data sources”页面,创建一个名为“ALL”的数据源,其中HTTP.URL的值保持跟默认创建的数据源“prometheus”中的一样即可,即 “http://prometheus-k8s.monitoring.svc:9090” ,然后保存上述数据源“ALL”。 + +### 导入HAMi默认的dashboard + +​ 访问“Dashboards”->“Browse”页面,导入此dashboard:https://grafana.com/grafana/dashboards/22043-hami-vgpu-metrics-dashboard/ ,grafana中将创建一个名为“hami-vgpu-metrics-dashboard”的dashboard,22043这个编号对应的dashboard在grafana8.5.5与grafana9.1.0验证过,在grafana9.1.0之后应该也能用。此时此页面中有一些Panel如vGPUCorePercentage还没有数据,请继续看完此文档,执行完“部署dcgm-exporter”与“创建ServiceMonitor”中的步骤之后Panel数据将正常显示。 + +​ 对于grafana8.5.5之前的版本如grafana7.5.17,请使用此dashboard:https://grafana.com/grafana/dashboards/21833-hami-vgpu-dashboard/ + +# 部署dcgm-exporter + +```shell +#下载dcgm-exporter代码仓库(未看到与kubernetes的兼容矩阵说明,此处使用tag 3.2.5-3.1.7) +git clone -b 3.2.5-3.1.7 https://github.com/NVIDIA/dcgm-exporter.git +cd dcgm-exporter + +#使用helm工具在monitoring空间下安装dcgm-exporter +helm install dcgm-exporter deployment/ -n monitoring + +#查看安装结果 +root@controller01:~/dcgm-exporter# helm list -n monitoring +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +dcgm-exporter monitoring 1 2024-10-02 16:32:35.691073696 +0800 CST deployed dcgm-exporter-3.1.7 3.1.7 +``` + +# 创建ServiceMonitor + +```shell 
+#创建文件hami-device-plugin-svc-monitor.yaml +root@controller01:~# touch hami-device-plugin-svc-monitor.yaml +#文件hami-device-plugin-svc-monitor.yaml内容如下 +root@controller01:~# cat hami-device-plugin-svc-monitor.yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: hami-device-plugin-svc-monitor + namespace: kube-system +spec: + selector: + matchLabels: + app.kubernetes.io/component: hami-device-plugin + namespaceSelector: + matchNames: + - "kube-system" + endpoints: + - path: /metrics + port: monitorport + interval: "15s" + honorLabels: false + relabelings: + - sourceLabels: [__meta_kubernetes_endpoints_name] + regex: hami-.* replacement: $1 action: keep - - source_labels: [__meta_kubernetes_pod_node_name] + - sourceLabels: [__meta_kubernetes_pod_node_name] regex: (.*) - target_label: node_name + targetLabel: node_name replacement: ${1} action: replace - - source_labels: [__meta_kubernetes_pod_host_ip] + - sourceLabels: [__meta_kubernetes_pod_host_ip] regex: (.*) - target_label: ip + targetLabel: ip replacement: $1 action: replace -- job_name: 'kubernetes-dcgm-exporter' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_endpoints_name] - regex: dcgm-exporter + +#应用此文件 +root@controller01:~# kubectl apply -f hami-device-plugin-svc-monitor.yaml +``` + +```shell +#创建文件hami-scheduler-svc-monitor.yaml +root@controller01:~# touch hami-scheduler-svc-monitor.yaml +#文件hami-scheduler-svc-monitor.yaml内容如下 +root@controller01:~# cat hami-scheduler-svc-monitor.yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: hami-scheduler-svc-monitor + namespace: kube-system +spec: + selector: + matchLabels: + app.kubernetes.io/component: hami-scheduler + namespaceSelector: + matchNames: + - "kube-system" + endpoints: + - path: /metrics + port: monitor + interval: "15s" + honorLabels: false + relabelings: + - sourceLabels: [__meta_kubernetes_endpoints_name] + regex: hami-.* 
replacement: $1 action: keep - - source_labels: [__meta_kubernetes_pod_node_name] + - sourceLabels: [__meta_kubernetes_pod_node_name] regex: (.*) - target_label: node_name + targetLabel: node_name replacement: ${1} action: replace - - source_labels: [__meta_kubernetes_pod_host_ip] + - sourceLabels: [__meta_kubernetes_pod_host_ip] regex: (.*) - target_label: ip + targetLabel: ip replacement: $1 action: replace + +#应用此文件 +root@controller01:~# kubectl apply -f hami-scheduler-svc-monitor.yaml +``` + +```shell +#确认创建的ServiceMonitor +root@controller01:~# kubectl -n kube-system get servicemonitor +NAME AGE +hami-device-plugin-svc-monitor 28h +hami-scheduler-svc-monitor 28h ``` -- 加载 promethues 配置: +# 确认最终监控效果 + +```shell +#创建文件gpu-pod.yaml,尝试使用HAMi虚拟出来的NVIDIA vGPU +root@controller01:~# touch gpu-pod.yaml +root@controller01:~# cat gpu-pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod-01 +spec: + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2 + resources: + limits: + nvidia.com/vgpu: 2 # 请求2个vGPUs + nvidia.com/gpumem: 3000 # 每个vGPU申请3000m显存 (可选,整数类型) + nvidia.com/gpucores: 10 # 每个vGPU使用实际显卡10%的算力 (可选,整数类型) -```bash -curl -XPOST http://{promethuesServer}:{port}/-/reload +#应用此文件 +root@controller01:~# kubectl apply -f gpu-pod.yaml +root@controller01:~# kubectl get pods -o wide +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +gpu-pod-01 0/1 Completed 0 52s 10.233.81.70 controller01 ``` + +​ 此时,应该可以在dashboard中看到监控详情。内容大概如下: + +![image-20241003215400685](../imgs/hami-vgpu-metrics-dashboard.png) \ No newline at end of file diff --git a/docs/gpu-dashboard.json b/docs/gpu-dashboard.json index 2f71c23ea..e5fb491d4 100644 --- a/docs/gpu-dashboard.json +++ b/docs/gpu-dashboard.json @@ -1,1150 +1,1834 @@ -{ - "annotations": { - "list": [ - { - "$$hashKey": "object:192", - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 
255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "description": "This dashboard is gpu metrics dashboard base on NVIDIA DCGM Exporter and HAMi/k8s-vgpu-scheduler", - "editable": true, - "gnetId": 12239, - "graphTooltip": 0, - "id": 46, - "iteration": 1694498903162, - "links": [], - "panels": [ - { - "datasource": "ALL", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 83 - }, - { - "color": "red", - "value": 87 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 4, - "x": 0, - "y": 0 - }, - "id": 14, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "text": {} - }, - "pluginVersion": "7.5.17", - "targets": [ - { - "expr": "avg(DCGM_FI_DEV_GPU_TEMP{node_name=~\"${node_name}\", gpu=~\"${gpu}\"})", - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "GPU平均温度", - "type": "gauge" - }, - { - "cacheTimeout": null, - "datasource": "ALL", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 2400, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 1800 - }, - { - "color": "red", - "value": 2200 - } - ] - }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 4, - "x": 4, - "y": 0 - }, - "id": 16, - "links": [], - "options": { - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "sum" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "text": {} - }, - 
"pluginVersion": "7.5.17", - "targets": [ - { - "expr": "sum(DCGM_FI_DEV_POWER_USAGE{node_name=~\"${node_name}\", gpu=~\"${gpu}\"})", - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "GPU总功率", - "type": "gauge" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "ALL", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 8, - "x": 8, - "y": 0 - }, - "hiddenSeries": false, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "sort": "current", - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "DCGM_FI_DEV_GPU_TEMP{node_name=~\"${node_name}\", gpu=~\"${gpu}\"}", - "instant": false, - "interval": "", - "legendFormat": "{{node_name}} gpu{{gpu}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU温度", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:97", - "format": "celsius", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "$$hashKey": "object:98", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 
null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "ALL", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 8, - "x": 16, - "y": 0 - }, - "hiddenSeries": false, - "id": 2, - "interval": "", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": false, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "DCGM_FI_DEV_SM_CLOCK{node_name=~\"${node_name}\", gpu=~\"${gpu}\"} * 1000000", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{node_name}} gpu{{gpu}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU SM时钟频率(DCGM_FI_DEV_SM_CLOCK)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:462", - "decimals": null, - "format": "hertz", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "$$hashKey": "object:463", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "ALL", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - 
}, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 18, - "legend": { - "avg": true, - "current": false, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "DCGM_FI_DEV_FB_USED{node_name=~\"${node_name}\", gpu=~\"${gpu}\"}", - "interval": "", - "legendFormat": "{{node_name}} gpu{{gpu}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU帧缓存(显存)使用量(DCGM_FI_DEV_FB_USED)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:618", - "format": "decmbytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "$$hashKey": "object:619", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "ALL", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 10 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": true, - "min": true, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - 
"lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "DCGM_FI_DEV_POWER_USAGE{node_name=~\"${node_name}\", gpu=~\"${gpu}\"}", - "interval": "", - "legendFormat": "{{node_name}} gpu{{gpu}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU功率消耗(DCGM_FI_DEV_POWER_USAGE)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:214", - "format": "watt", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "$$hashKey": "object:215", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "ALL", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 20 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": true, - "min": true, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": 
false, - "targets": [ - { - "exemplar": true, - "expr": "DCGM_FI_DEV_GPU_UTIL{node_name=~\"${node_name}\", gpu=~\"${gpu}\"}", - "interval": "", - "legendFormat": "{{node_name}} gpu{{gpu}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU利用率(DCGM_FI_DEV_GPU_UTIL)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:699", - "format": "percent", - "label": null, - "logBase": 1, - "max": "100", - "min": "0", - "show": true - }, - { - "$$hashKey": "object:700", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "uid": "ALL" - }, - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 20 - }, - "hiddenSeries": false, - "id": 26, - "legend": { - "alignAsTable": false, - "avg": true, - "current": false, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "9.3.14", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "uid": "ALL" - }, - "exemplar": true, - "expr": "Device_utilization_desc_of_container{node_name=~\"${node_name}\"}", - "interval": "", - "legendFormat": "{{podname}}", - "refId": "A" - } - ], - "thresholds": [], - "timeRegions": [], - "title": 
"HAMi-pod算力使用率", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:779", - "format": "percent", - "logBase": 1, - "max": "100", - "min": "0", - "show": true - }, - { - "$$hashKey": "object:780", - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "ALL", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 20 - }, - "hiddenSeries": false, - "id": 24, - "legend": { - "alignAsTable": false, - "avg": true, - "current": false, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "Device_memory_desc_of_container{node_name=~\"${node_name}\"}", - "interval": "", - "legendFormat": "{{podname}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "HAMi-pod显存使用量(byte)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:779", - "format": "bytes", - "label": null, - "logBase": 1, - "show": true - }, - { - "$$hashKey": "object:780", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, 
- "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "ALL", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 11, - "w": 12, - "x": 0, - "y": 30 - }, - "hiddenSeries": false, - "id": 22, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": true, - "min": true, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "HostGPUMemoryUsage{node_name=~\"${node_name}\"}", - "interval": "", - "legendFormat": "{{node_name}} gpu {{deviceid}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "HAMi-节点GPU显存使用量", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:1087", - "format": "bytes", - "label": null, - "logBase": 1, - "show": true - }, - { - "$$hashKey": "object:1088", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "ALL", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 11, - "w": 12, - "x": 12, - "y": 30 - }, 
- "hiddenSeries": false, - "id": 20, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": true, - "min": true, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "HostCoreUtilization{node_name=~\"${node_name}\"}", - "interval": "", - "legendFormat": "{{node_name}} gpu {{deviceid}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "HAMi-节点GPU算力使用率", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:1243", - "format": "percent", - "label": null, - "logBase": 1, - "max": "100", - "min": "0", - "show": true - }, - { - "$$hashKey": "object:1244", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "refresh": false, - "schemaVersion": 27, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": null, - "current": { - "selected": false - }, - "datasource": "ALL", - "definition": "label_values({__name__=~\"DCGM_FI_DEV_FB_FREE|vGPU_device_memory_limit_in_bytes\"}, node_name)", - "description": null, - "error": null, - "hide": 0, - "includeAll": false, - "label": null, - "multi": true, - "name": "node_name", - "options": [], - "query": { - "query": "label_values({__name__=~\"DCGM_FI_DEV_FB_FREE|vGPU_device_memory_limit_in_bytes\"}, node_name)", - "refId": 
"StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "selected": false, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "ALL", - "definition": "label_values(DCGM_FI_DEV_FB_FREE{node_name=\"$node_name\"},gpu)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "gpu", - "options": [], - "query": { - "query": "label_values(DCGM_FI_DEV_FB_FREE{node_name=\"$node_name\"},gpu)", - "refId": "ALL-gpu-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-12h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ] - }, - "timezone": "", - "title": "k8s-vgpu-scheduler Dashboard", - "uid": "Oxed_c6Wz1", - "version": 3 -} +{ + "__inputs": [ + { + "name": "DS_ALL", + "label": "ALL", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": [], + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.5.5" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + } + ], + "annotations": { + 
"list": [ + { + "$$hashKey": "object:192", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "This dashboard is gpu metrics dashboard base on NVIDIA DCGM Exporter and HAMi/k8s-vgpu-scheduler This dashboard is gpu metrics dashboard base on NVIDIA DCGM Exporter and HAMi/k8s-vgpu-scheduler, and was modified from https://grafana.com/grafana/dashboards/21833-hami-vgpu-dashboard/ ", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 21833, + "graphTooltip": 0, + "id": null, + "iteration": 1728370823317, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 47, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(DCGM_FI_DEV_DEC_UTIL{node_name=~\"${node_name}\",UUID=~\"${uuid}\"})", + "instant": true, + "range": false, + "refId": "A" + } + ], + "title": "GPU Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" 
+ }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 2, + "x": 3, + "y": 0 + }, + "id": 59, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(count_values(\"DCGM_FI_DEV_XID_ERRORS\", DCGM_FI_DEV_XID_ERRORS{node_name=~\"${node_name}\",UUID=~\"${uuid}\"} > 0)) or on() vector(0)", + "instant": true, + "range": false, + "refId": "A" + } + ], + "title": "XID Error", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 83 + }, + { + "color": "red", + "value": 87 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 5, + "y": 0 + }, + "id": 55, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "avg(DCGM_FI_DEV_GPU_TEMP{node_name=~\"${node_name}\",UUID=~\"${uuid}\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Average GPU Temp", + "type": "gauge" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 2400, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1800 + }, + { + "color": "red", + "value": 2200 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 8, + "y": 0 + }, + "id": 57, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "sum" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(DCGM_FI_DEV_POWER_USAGE{node_name=~\"${node_name}\",UUID=~\"${uuid}\"})", + "instant": true, + "interval": "", + "legendFormat": "", + "range": false, + "refId": "A" + } + ], + "title": "GPU power usage", + "type": "gauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 13, + "x": 11, + "y": 0 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "sort": "current", + "sortDesc": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "DCGM_FI_DEV_GPU_TEMP{node_name=~\"${node_name}\",UUID=~\"${uuid}\"}", + "instant": false, + "interval": "", + "legendFormat": "{{node_name}} uid:{{UUID}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "GPU temp(DCGM)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:97", + "format": "celsius", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:98", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "displayMode": "auto", + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 11, + "x": 0, + "y": 5 + }, + "id": 53, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "instance" + } + ] + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nodeGPUOverview{nodeid=~\"${node_name}\",deviceuuid=~\"${uuid}\"}", + "instant": true, + "range": false, + "refId": "A" + } + ], + "title": "nodeGPUOverview", + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "merge", + "options": {} + }, + { + "id": "merge", + 
"options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "branch": true, + "container": true, + "dc": true, + "devicecores": true, + "deviceidx": true, + "devicememorylimit": false, + "goversion": true, + "instance": true, + "ip": true, + "job": true, + "namespace": true, + "node_name": true, + "nodeid": false, + "pod": true, + "project": true, + "revision": true, + "sharedcontainers": false, + "zone": true + }, + "indexByName": { + "Time": 2, + "Value": 3, + "dc": 4, + "devicecores": 5, + "deviceidx": 6, + "devicememorylimit": 11, + "devicetype": 10, + "deviceuuid": 12, + "instance": 1, + "ip": 9, + "job": 0, + "node_name": 8, + "nodeid": 7, + "project": 13, + "sharedcontainers": 14, + "zone": 15 + }, + "renameByName": { + "devicememorylimit": "vram", + "devicetype": "", + "node_name": "", + "sharedcontainers": "" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "displayMode": "auto", + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 13, + "x": 11, + "y": 5 + }, + "id": 61, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "instance" + } + ] + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "DCGM_FI_DEV_POWER_USAGE{node_name=~\"${node_name}\", UUID=~\"${uuid}\"}", + "instant": true, + "range": false, + "refId": "A" + } + ], + "title": "nodeGPUList(DCGM)", + "transformations": [ + { + "id": "labelsToFields", + "options": {} 
+ }, + { + "id": "merge", + "options": {} + }, + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "DCGM_FI_CUDA_DRIVER_VERSION": false, + "DCGM_FI_DEV_BRAND": true, + "DCGM_FI_DEV_MINOR_NUMBER": true, + "DCGM_FI_DEV_NAME": true, + "DCGM_FI_DEV_SERIAL": true, + "DCGM_FI_DRIVER_VERSION": false, + "DCGM_FI_PROCESS_NAME": true, + "Hostname": true, + "Time": true, + "UUID": false, + "Value": true, + "branch": true, + "container": true, + "dc": true, + "device": true, + "devicecores": true, + "deviceidx": true, + "goversion": true, + "gpu": true, + "instance": true, + "ip": true, + "job": true, + "modelName": false, + "namespace": true, + "node_name": false, + "nodeid": false, + "pod": true, + "project": true, + "revision": true, + "sharedcontainers": false, + "zone": true + }, + "indexByName": { + "DCGM_FI_CUDA_DRIVER_VERSION": 11, + "DCGM_FI_DEV_BRAND": 13, + "DCGM_FI_DEV_MINOR_NUMBER": 14, + "DCGM_FI_DEV_NAME": 15, + "DCGM_FI_DEV_SERIAL": 16, + "DCGM_FI_DRIVER_VERSION": 12, + "DCGM_FI_PROCESS_NAME": 17, + "Hostname": 6, + "Time": 1, + "UUID": 19, + "Value": 2, + "dc": 3, + "device": 10, + "gpu": 7, + "instance": 5, + "ip": 18, + "job": 0, + "modelName": 9, + "node_name": 8, + "project": 4 + }, + "renameByName": { + "devicememorylimit": "vram", + "devicetype": "", + "node_name": "", + "sharedcontainers": "" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 11, + "x": 0, + "y": 10 + }, + "id": 49, + "interval": "", + "links": [], + "options": { + "displayMode": "basic", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "sum by (deviceuuid, nodename) (vGPUCorePercentage{nodename=~\"${node_name}\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": " {{nodename}}:{{deviceuuid}}", + "refId": "A" + } + ], + "title": "vGPUCorePercentage", + "type": "bargauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 11, + "y": 10 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "DCGM_FI_DEV_FB_USED{node_name=~\"${node_name}\", UUID=~\"${uuid}\"}", + "interval": "", + "legendFormat": "{{node_name}} uid:{{UUID}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "GPU FB used(DCGM)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:618", + "format": "decmbytes", + "logBase": 1, + "show": true + }, + { 
+ "$$hashKey": "object:619", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 10 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "DCGM_FI_DEV_GPU_UTIL{node_name=~\"${node_name}\", UUID=~\"${uuid}\"}", + "interval": "", + "legendFormat": "{{node_name}} uid:{{UUID}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "GPU util(DCGM)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:699", + "format": "percent", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:700", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 11, + "x": 0, + "y": 15 + }, + "id": 51, + "interval": "", + "links": [], + "options": { + "displayMode": "basic", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "sum by (deviceuuid, nodename) (vGPUMemoryPercentage{nodename=~\"${node_name}\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{nodename}}:{{deviceuuid}}", + "refId": "A" + } + ], + "title": "vGPUMemoryPercentage", + "type": "bargauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 7, + "x": 11, + "y": 18 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "DCGM_FI_DEV_POWER_USAGE{node_name=~\"${node_name}\",UUID=~\"${uuid}\"}", + "interval": "", + "legendFormat": "{{node_name}} uid:{{UUID}}", + "refId": "A" 
+ } + ], + "thresholds": [], + "timeRegions": [], + "title": "GPU power usage(DCGM)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:214", + "format": "watt", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:215", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "unit": "hertz" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 18 + }, + "hiddenSeries": false, + "id": 65, + "interval": "", + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 80, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "DCGM_FI_DEV_SM_CLOCK{node_name=~\"${node_name}\", UUID=~\"${uuid}\"} * 1000000", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{node_name}} uid:{{UUID}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "GPU SM Clock(DCGM)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": 
"object:536", + "format": "hertz", + "label": "", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:537", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 10, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "index": 0 + }, + { + "color": "red", + "index": 1, + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 11, + "x": 0, + "y": 20 + }, + "id": 36, + "interval": "", + "links": [], + "options": { + "displayMode": "basic", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "count by (node_name) (vGPU_device_memory_usage_in_bytes{node_name=~\"${node_name}\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{node_name}}", + "refId": "A" + } + ], + "title": "vgpu used by nodes", + "type": "bargauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 11, + "x": 0, + "y": 25 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + 
"options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "sum by (podname) (Device_memory_desc_of_container{node_name=~\"${node_name}\",deviceuuid=~\"${uuid}\"})", + "interval": "", + "legendFormat": "{{podname}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "HAMi-Memory desc of container ", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:779", + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:780", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "decimals": 150, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 13, + "x": 11, + "y": 25 + }, + "hiddenSeries": false, + "id": 38, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_ALL}" + }, + "exemplar": true, + "expr": "sum by (podname) (Device_utilization_desc_of_container{node_name=~\"${node_name}\",deviceuuid=~\"${uuid}\"})", + "interval": "", + "legendFormat": "{{podname}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "HAMi-GPU util desc of container", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1243", + "format": "percent", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1244", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 11, + "x": 0, + "y": 35 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "HostGPUMemoryUsage{node_name=~\"${node_name}\",deviceuuid=~\"${uuid}\"}", + "interval": "", + "legendFormat": "{{node_name}} uid:{{deviceuuid}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "HAMi-Host memory usage", + 
"tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1087", + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:1088", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 13, + "x": 11, + "y": 35 + }, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "exemplar": true, + "expr": "HostCoreUtilization{node_name=~\"${node_name}\",deviceuuid=~\"${uuid}\"}", + "interval": "", + "legendFormat": "{{node_name}} uid:{{deviceuuid}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "HAMi-Host core util", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1243", + "format": "percent", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1244", + "format": "short", + 
"logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "refresh": "5s", + "schemaVersion": 36, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "definition": "label_values({__name__=~\"DCGM_FI_DEV_FB_FREE|vGPU_device_memory_limit_in_bytes\"}, node_name)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "node_name", + "options": [], + "query": { + "query": "label_values({__name__=~\"DCGM_FI_DEV_FB_FREE|vGPU_device_memory_limit_in_bytes\"}, node_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_ALL}" + }, + "definition": "label_values(DCGM_FI_DEV_FB_FREE{node_name=~\"$node_name\"},UUID)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "uuid", + "options": [], + "query": { + "query": "label_values(DCGM_FI_DEV_FB_FREE{node_name=~\"$node_name\"},UUID)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "browser", + "title": "hami-vgpu-metrics-dashboard", + "uid": "Oxed_c6Wz22", + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/imgs/hami-vgpu-metrics-dashboard.png b/imgs/hami-vgpu-metrics-dashboard.png new file mode 100644 index 000000000..0f322f7eb Binary files /dev/null and b/imgs/hami-vgpu-metrics-dashboard.png differ