From 694375287f8187470f87aa21f7ee791ef2a2d697 Mon Sep 17 00:00:00 2001
From: Chandrakala Subramanyam
Date: Wed, 18 Sep 2024 15:20:35 +0530
Subject: [PATCH 01/64] Test plan for kruize rel 0.0.25
Signed-off-by: Chandrakala Subramanyam
---
tests/test_plans/test_plan_rel_0.0.25.md | 134 +++++++++++++++++++++++
1 file changed, 134 insertions(+)
create mode 100644 tests/test_plans/test_plan_rel_0.0.25.md
diff --git a/tests/test_plans/test_plan_rel_0.0.25.md b/tests/test_plans/test_plan_rel_0.0.25.md
new file mode 100644
index 000000000..012166e1b
--- /dev/null
+++ b/tests/test_plans/test_plan_rel_0.0.25.md
@@ -0,0 +1,134 @@
+# KRUIZE TEST PLAN RELEASE 0.0.25
+
+- [INTRODUCTION](#introduction)
+- [FEATURES TO BE TESTED](#features-to-be-tested)
+- [BUG FIXES TO BE TESTED](#bug-fixes-to-be-tested)
+- [TEST ENVIRONMENT](#test-environment)
+- [TEST DELIVERABLES](#test-deliverables)
+ - [New Test Cases Developed](#new-test-cases-developed)
+ - [Regression Testing](#regression-testing)
+- [SCALABILITY TESTING](#scalability-testing)
+- [RELEASE TESTING](#release-testing)
+- [TEST METRICS](#test-metrics)
+- [RISKS AND CONTINGENCIES](#risks-and-contingencies)
+- [APPROVALS](#approvals)
+
+-----
+
+## INTRODUCTION
+
+This document describes the test plan for Kruize remote monitoring release 0.0.25
+
+----
+
+## FEATURES TO BE TESTED
+
+* Addition of Metric profile json into Kruize manifests
+* Support for Datasource authentication using bearer token
+* Support for Kruize Local Namespace level recommendations
+
+------
+
+## BUG FIXES TO BE TESTED
+
+* Configure openshift port for prometheus service
+
+---
+
+## TEST ENVIRONMENT
+
+* Minikube Cluster
+* Openshift Cluster
+
+---
+
+## TEST DELIVERABLES
+
+### New Test Cases Developed
+
+| # | ISSUE (NEW FEATURE) | TEST DESCRIPTION | TEST DELIVERABLES | RESULTS | COMMENTS |
+|---|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|--------------------------------------------------------|---------| --- |
+| 1 | Addition of Metric profile json into Kruize manifests | Metric profile json location update in existing tests and demos | | PASSED | |
+| 2 | [Support for Datasource authentication using bearer token](https://github.com/kruize/autotune/pull/1289) | Tested manually | | PASSED | |
+| 3 | Support for Kruize Local Namespace level recommendations [1248](https://github.com/kruize/autotune/pull/1248), [1249](https://github.com/kruize/autotune/pull/1249), [1275](https://github.com/kruize/autotune/pull/1275) | [New tests added](https://github.com/kruize/autotune/pull/1293) | | | |
+| 4 | [Configure openshift port for prometheus service](https://github.com/kruize/autotune/pull/1278) | Updated existing tests to test with the specified datasource service name and namespace | [1291](https://github.com/kruize/autotune/pull/1291) | PASSED | |
+
+
+
+### Regression Testing
+
+| # | ISSUE (BUG/NEW FEATURE) | TEST CASE | RESULTS | COMMENTS |
+|---|-------------------------------------------------------|---------------------------------------------------------|---------| --- |
+| 1 | Addition of Metric profile json into Kruize manifests | Kruize local monitoring tests and local monitoring demo | PASSED | |
+| 2 | Configure openshift port for prometheus service | Kruize local monitoring functional tests | PASSED | |
+
+---
+
+## SCALABILITY TESTING
+
+Evaluate Kruize Scalability on OCP, with 5k experiments by uploading resource usage data for 15 days and update recommendations.
+Changes do not have scalability implications. Short scalability test will be run as part of the release testing
+
+Short Scalability run
+- 5K exps / 15 days of results / 2 containers per exp
+- Kruize replicas - 10
+- OCP - Scalelab cluster
+
+Kruize Release | Exps / Results / Recos | Execution time | Latency (Max/ Avg) in seconds | | | Postgres DB size(MB) | Kruize Max CPU | Kruize Max Memory (GB)
+-- |------------------------|----------------|-------------------------------|---------------|----------------------|----------------------|----------------| --
+ | | | | UpdateRecommendations | UpdateResults | LoadResultsByExpName | | |
+0.0.24_mvp | 5K / 72L / 3L | 4h 04 mins | 0.8 / 0.47 | 0.13 / 0.12 | 0.53 / 0.36 | 21752 | 4.63 | 34.72
+0.0.25_mvp | 5K / 72L / 3L | 4h 05 mins | 0.8 / 0.47 | 0.13 / 0.12 | 0.53 / 0.36 | 21783 | 4.61 | 36.81
+
+----
+## RELEASE TESTING
+
+As part of the release testing, following tests will be executed:
+- [Kruize Remote monitoring Functional tests](/tests/scripts/remote_monitoring_tests/Remote_monitoring_tests.md)
+- [Fault tolerant test](/tests/scripts/remote_monitoring_tests/fault_tolerant_tests.md)
+- [Stress test](/tests/scripts/remote_monitoring_tests/README.md)
+- [DB Migration test](/tests/scripts/remote_monitoring_tests/db_migration_test.md)
+- [Recommendation and box plot values validation test](https://github.com/kruize/kruize-demos/blob/main/monitoring/remote_monitoring_demo/recommendations_infra_demo/README.md)
+- [Scalability test (On openshift)](/tests/scripts/remote_monitoring_tests/scalability_test.md) - scalability test with 5000 exps / 15 days usage data
+- [Kruize remote monitoring demo (On minikube)](https://github.com/kruize/kruize-demos/blob/main/monitoring/remote_monitoring_demo/README.md)
+- [Kruize local monitoring demo (On openshift)](https://github.com/kruize/kruize-demos/blob/main/monitoring/local_monitoring_demo)
+- [Kruize local monitoring Functional tests](/tests/scripts/local_monitoring_tests/Local_monitoring_tests.md)
+
+
+| # | TEST SUITE | EXPECTED RESULTS | ACTUAL RESULTS | COMMENTS |
+| --- | ---------- |-----------------------------------------|-----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| 1 | Kruize Remote monitoring Functional testsuite | TOTAL - 359, PASSED - 308 / FAILED - 51 | TOTAL - 359, PASSED - 308 / FAILED - 51 | Intermittent issue seen [1281](https://github.com/kruize/autotune/issues/1281), existing issues - [559](https://github.com/kruize/autotune/issues/559), [610](https://github.com/kruize/autotune/issues/610) |
+| 2 | Fault tolerant test | PASSED | PASSED | |
+| 3 | Stress test | PASSED | PASSED | |
+| 4 | Scalability test (short run)| | | Exps - 5000, Results - 72000, execution time - 3 hours 51 mins |
+| 5 | DB Migration test | PASSED | PASSED | Tested on Openshift |
+| 6 | Recommendation and box plot values validations | PASSED | PASSED | |
+| 7 | Kruize remote monitoring demo | PASSED | PASSED | Tested manually |
+| 8 | Kruize Local monitoring demo | PASSED | PASSED | |
+| 9 | Kruize Local Functional tests | TOTAL - 64, PASSED - 60 / FAILED - 4 | TOTAL - 64, PASSED - 60 / FAILED - 4 | [Issue 1217](https://github.com/kruize/autotune/issues/1217), [Issue 1273](https://github.com/kruize/autotune/issues/1273) |
+
+---
+
+## TEST METRICS
+
+### Test Completion Criteria
+
+* All must_fix defects identified for the release are fixed
+* New features work as expected and tests have been added to validate these
+* No new regressions in the functional tests
+* All non-functional tests work as expected without major issues
+* Documentation updates have been completed
+
+----
+
+## RISKS AND CONTINGENCIES
+
+* None
+
+----
+## APPROVALS
+
+Sign-off
+
+----
+
From 1afeb01e2cda006dd9186be8b1f412b9669248b3 Mon Sep 17 00:00:00 2001
From: Chandrakala Subramanyam
Date: Mon, 23 Sep 2024 16:18:02 +0530
Subject: [PATCH 02/64] Updated test results
Signed-off-by: Chandrakala Subramanyam
---
tests/test_plans/test_plan_rel_0.0.25.md | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tests/test_plans/test_plan_rel_0.0.25.md b/tests/test_plans/test_plan_rel_0.0.25.md
index 012166e1b..58d1483df 100644
--- a/tests/test_plans/test_plan_rel_0.0.25.md
+++ b/tests/test_plans/test_plan_rel_0.0.25.md
@@ -78,7 +78,7 @@ Kruize Release | Exps / Results / Recos | Execution time | Latency (Max/ Avg) in
-- |------------------------|----------------|-------------------------------|---------------|----------------------|----------------------|----------------| --
| | | | UpdateRecommendations | UpdateResults | LoadResultsByExpName | | |
0.0.24_mvp | 5K / 72L / 3L | 4h 04 mins | 0.8 / 0.47 | 0.13 / 0.12 | 0.53 / 0.36 | 21752 | 4.63 | 34.72
-0.0.25_mvp | 5K / 72L / 3L | 4h 05 mins | 0.8 / 0.47 | 0.13 / 0.12 | 0.53 / 0.36 | 21783 | 4.61 | 36.81
+0.0.25_mvp | 5K / 72L / 3L | 4h 04 mins | 0.79 / 0.47 | 0.13 / 0.12 | 0.52 / 0.36 | 21760 | 4.48 | 38.84
----
## RELEASE TESTING
@@ -97,15 +97,15 @@ As part of the release testing, following tests will be executed:
| # | TEST SUITE | EXPECTED RESULTS | ACTUAL RESULTS | COMMENTS |
| --- | ---------- |-----------------------------------------|-----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| 1 | Kruize Remote monitoring Functional testsuite | TOTAL - 359, PASSED - 308 / FAILED - 51 | TOTAL - 359, PASSED - 308 / FAILED - 51 | Intermittent issue seen [1281](https://github.com/kruize/autotune/issues/1281), existing issues - [559](https://github.com/kruize/autotune/issues/559), [610](https://github.com/kruize/autotune/issues/610) |
+| 1 | Kruize Remote monitoring Functional testsuite | TOTAL - 359, PASSED - 292 / FAILED - 67 | TOTAL - 359, PASSED - 292 / FAILED - 67 | Intermittent issue seen [1281](https://github.com/kruize/autotune/issues/1281), existing issues - [559](https://github.com/kruize/autotune/issues/559), [610](https://github.com/kruize/autotune/issues/610) |
| 2 | Fault tolerant test | PASSED | PASSED | |
| 3 | Stress test | PASSED | PASSED | |
-| 4 | Scalability test (short run)| | | Exps - 5000, Results - 72000, execution time - 3 hours 51 mins |
+| 4 | Scalability test (short run)| | | Exps - 5000, Results - 72000, execution time - 4 hours 4 mins |
| 5 | DB Migration test | PASSED | PASSED | Tested on Openshift |
| 6 | Recommendation and box plot values validations | PASSED | PASSED | |
| 7 | Kruize remote monitoring demo | PASSED | PASSED | Tested manually |
| 8 | Kruize Local monitoring demo | PASSED | PASSED | |
-| 9 | Kruize Local Functional tests | TOTAL - 64, PASSED - 60 / FAILED - 4 | TOTAL - 64, PASSED - 60 / FAILED - 4 | [Issue 1217](https://github.com/kruize/autotune/issues/1217), [Issue 1273](https://github.com/kruize/autotune/issues/1273) |
+| 9 | Kruize Local Functional tests | TOTAL - 77, PASSED - 72 / FAILED - 3 | TOTAL - 77, PASSED - 72 / FAILED - 5 | [Issue 1217](https://github.com/kruize/autotune/issues/1217), [Issue 1273](https://github.com/kruize/autotune/issues/1273) |
---
From 8be64e0435e81328ad371792ab263748908dfc10 Mon Sep 17 00:00:00 2001
From: Chandrakala Subramanyam
Date: Fri, 27 Sep 2024 23:07:28 +0530
Subject: [PATCH 03/64] Updated the test results with the final build
Signed-off-by: Chandrakala Subramanyam
---
tests/test_plans/test_plan_rel_0.0.25.md | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tests/test_plans/test_plan_rel_0.0.25.md b/tests/test_plans/test_plan_rel_0.0.25.md
index 58d1483df..312ca0a8c 100644
--- a/tests/test_plans/test_plan_rel_0.0.25.md
+++ b/tests/test_plans/test_plan_rel_0.0.25.md
@@ -78,7 +78,7 @@ Kruize Release | Exps / Results / Recos | Execution time | Latency (Max/ Avg) in
-- |------------------------|----------------|-------------------------------|---------------|----------------------|----------------------|----------------| --
| | | | UpdateRecommendations | UpdateResults | LoadResultsByExpName | | |
0.0.24_mvp | 5K / 72L / 3L | 4h 04 mins | 0.8 / 0.47 | 0.13 / 0.12 | 0.53 / 0.36 | 21752 | 4.63 | 34.72
-0.0.25_mvp | 5K / 72L / 3L | 4h 04 mins | 0.79 / 0.47 | 0.13 / 0.12 | 0.52 / 0.36 | 21760 | 4.48 | 38.84
+0.0.25_mvp | 5K / 72L / 3L | 4h 06 mins | 0.8 / 0.47 | 0.14 / 0.12 | 0.52 / 0.36 | 21756 | 4.91 | 30.13
----
## RELEASE TESTING
@@ -97,15 +97,15 @@ As part of the release testing, following tests will be executed:
| # | TEST SUITE | EXPECTED RESULTS | ACTUAL RESULTS | COMMENTS |
| --- | ---------- |-----------------------------------------|-----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| 1 | Kruize Remote monitoring Functional testsuite | TOTAL - 359, PASSED - 292 / FAILED - 67 | TOTAL - 359, PASSED - 292 / FAILED - 67 | Intermittent issue seen [1281](https://github.com/kruize/autotune/issues/1281), existing issues - [559](https://github.com/kruize/autotune/issues/559), [610](https://github.com/kruize/autotune/issues/610) |
+| 1 | Kruize Remote monitoring Functional testsuite | TOTAL - 359, PASSED - 316 / FAILED - 43 | TOTAL - 359, PASSED - 316 / FAILED - 43 | Intermittent issue seen [1281](https://github.com/kruize/autotune/issues/1281), existing issues - [559](https://github.com/kruize/autotune/issues/559), [610](https://github.com/kruize/autotune/issues/610) |
| 2 | Fault tolerant test | PASSED | PASSED | |
| 3 | Stress test | PASSED | PASSED | |
-| 4 | Scalability test (short run)| | | Exps - 5000, Results - 72000, execution time - 4 hours 4 mins |
+| 4 | Scalability test (short run)| | | Exps - 5000, Results - 72000, execution time - 4 hours 6 mins |
| 5 | DB Migration test | PASSED | PASSED | Tested on Openshift |
| 6 | Recommendation and box plot values validations | PASSED | PASSED | |
| 7 | Kruize remote monitoring demo | PASSED | PASSED | Tested manually |
| 8 | Kruize Local monitoring demo | PASSED | PASSED | |
-| 9 | Kruize Local Functional tests | TOTAL - 77, PASSED - 72 / FAILED - 3 | TOTAL - 77, PASSED - 72 / FAILED - 5 | [Issue 1217](https://github.com/kruize/autotune/issues/1217), [Issue 1273](https://github.com/kruize/autotune/issues/1273) |
+| 9 | Kruize Local Functional tests | TOTAL - 78, PASSED - 75 / FAILED - 3 | TOTAL - 78, PASSED - 75 / FAILED - 3 | [Issue 1217](https://github.com/kruize/autotune/issues/1217), [Issue 1273](https://github.com/kruize/autotune/issues/1273) |
---
From 5233b341f0a22ec8bb24b6ce20b189e23ce24ca7 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Thu, 3 Oct 2024 11:38:31 +0530
Subject: [PATCH 04/64] Make changes to performance profile
Signed-off-by: bharathappali
---
...esource_optimization_local_monitoring.yaml | 324 ++++++++++--------
1 file changed, 182 insertions(+), 142 deletions(-)
diff --git a/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.yaml b/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.yaml
index e638c07e9..92a68a6b2 100644
--- a/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.yaml
+++ b/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.yaml
@@ -247,167 +247,207 @@ slo:
- function: max
query: 'max by(namespace,container) (last_over_time((timestamp(container_cpu_usage_seconds_total{namespace="$NAMESPACE$", container="$CONTAINER_NAME$"} > 0))[15d:]))'
- ## namespace related queries
-
- # Namespace quota for CPU requests
- # Show namespace quota for CPU requests in cores for a namespace
- - name: namespaceCpuRequest
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # sum of all cpu request quotas for a namespace in cores
- - function: sum
- query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="requests.cpu", type="hard"})'
-
- # Namespace quota for CPU limits
- # Show namespace quota for CPU limits in cores for a namespace
- - name: namespaceCpuLimit
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # sum of all cpu limits quotas for a namespace in cores
- - function: sum
- query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="limits.cpu", type="hard"})'
-
-
- # Namespace quota for memory requests
- # Show namespace quota for memory requests in bytes for a namespace
- - name: namespaceMemoryRequest
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # sum of all memory requests quotas for a namespace in bytes
- - function: sum
- query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="requests.memory", type="hard"})'
-
-
- # Namespace quota for memory limits
- # Show namespace quota for memory limits in bytes for a namespace
- - name: namespaceMemoryLimit
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # sum of all memory limits quotas for a namespace in bytes
- - function: sum
- query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="limits.memory", type="hard"})'
-
-
- # Namespace CPU usage
- # Show cpu usages in cores for a namespace
- - name: namespaceCpuUsage
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # average cpu usages in cores for a namespace
- - function: avg
- query: 'avg_over_time(sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ ## namespace related queries
- # maximum cpu usages in cores for a namespace
- - function: max
- query: 'max_over_time(sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace quota for CPU requests
+ # Show namespace quota for CPU requests in cores for a namespace
+ - name: namespaceCpuRequest
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # sum of all cpu request quotas for a namespace in cores
+ - function: sum
+ query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="requests.cpu", type="hard"})'
- # minimum cpu usages in cores for a namespace
- - function: min
- query: 'min_over_time(sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace quota for CPU limits
+ # Show namespace quota for CPU limits in cores for a namespace
+ - name: namespaceCpuLimit
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # sum of all cpu limits quotas for a namespace in cores
+ - function: sum
+ query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="limits.cpu", type="hard"})'
- # Namespace CPU Throttle
- # Show cpu throttle in cores for a namespace
- - name: namespaceCpuThrottle
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # average cpu throttle in cores for a namespace
- - function: avg
- query: 'avg_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace quota for memory requests
+ # Show namespace quota for memory requests in bytes for a namespace
+ - name: namespaceMemoryRequest
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # sum of all memory requests quotas for a namespace in bytes
+ - function: sum
+ query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="requests.memory", type="hard"})'
- # maximum cpu throttle in cores for a namespace
- - function: max
- query: 'max_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
- # minimum cpu throttle in cores for a namespace
- - function: min
- query: 'min_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace quota for memory limits
+ # Show namespace quota for memory limits in bytes for a namespace
+ - name: namespaceMemoryLimit
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # sum of all memory limits quotas for a namespace in bytes
+ - function: sum
+ query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="limits.memory", type="hard"})'
- # Namespace memory usage
- # Show memory usages in bytes for a namespace
- - name: namespaceMemoryUsage
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # average memory usage in bytes for a namespace
- - function: avg
- query: 'avg_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace CPU usage
+ # Show cpu usages in cores for a namespace
+ - name: namespaceCpuUsage
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # average cpu usages in cores for a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
- # maximum memory usage in bytes for a namespace
- - function: max
- query: 'max_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # maximum cpu usages in cores for a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # minimum cpu usages in cores for a namespace
+ - function: min
+ query: 'min_over_time(sum by(namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+
+ # Namespace CPU Throttle
+ # Show cpu throttle in cores for a namespace
+ - name: namespaceCpuThrottle
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # average cpu throttle in cores for a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # maximum cpu throttle in cores for a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # minimum cpu throttle in cores for a namespace
+ - function: min
+ query: 'min_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
- # minimum memory usage in bytes for a namespace
- - function: min
- query: 'min_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace memory usage
+ # Show memory usages in bytes for a namespace
+ - name: namespaceMemoryUsage
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # average memory usage in bytes for a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # maximum memory usage in bytes for a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
- # Namespace memory rss value
- # Show memory rss in bytes for a namespace
- - name: namespaceMemoryRSS
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # average memory rss in bytes for a namespace
+ # minimum memory usage in bytes for a namespace
+ - function: min
+ query: 'min_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+
+ # Namespace memory rss value
+ # Show memory rss in bytes for a namespace
+ - name: namespaceMemoryRSS
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # average memory rss in bytes for a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # maximum memory rss in bytes for a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # minimum memory rss in bytes for a namespace
+ - function: min
+ query: 'min_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+
+ # Show total pods in a namespace
+ - name: namespaceTotalPods
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # maximum total pods in a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) ((kube_pod_info{namespace="$NAMESPACE$"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # average total pods in a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) ((kube_pod_info{namespace="$NAMESPACE$"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+
+ # Show total running pods in a namespace
+ - name: namespaceRunningPods
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # maximum total pods in a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) ((kube_pod_status_phase{phase="Running"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # average total pods in a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) ((kube_pod_status_phase{phase="Running"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # Show last activity for a namespace
+ - name: namespaceMaxDate
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ - function: max
+ query: 'max(last_over_time(timestamp((sum by (namespace) (container_cpu_usage_seconds_total{namespace="$NAMESPACE$"})) > 0 )[15d:]))'
+
+ # GPU Related metrics
+
+ # GPU Core Usage
+ - name: gpuCoreUsage
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "container"
+
+ aggregation_functions:
+ # Average GPU Core Usage Percentage per container in a deployment
- function: avg
- query: 'avg_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ query: 'avg by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
- # maximum memory rss in bytes for a namespace
+ # Maximum GPU Core Usage Percentage per container in a deployment
- function: max
- query: 'max_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ query: 'max by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
- # minimum memory rss in bytes for a namespace
+ # Minimum of GPU Core Usage Percentage for a container in a deployment
- function: min
- query: 'min_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ query: 'min by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (min_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
+ # GPU Memory usage
+ - name: gpuMemoryUsage
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "container"
- # Show total pods in a namespace
- - name: namespaceTotalPods
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # maximum total pods in a namespace
- - function: max
- query: 'max_over_time(sum by(namespace) ((kube_pod_info{namespace="$NAMESPACE$"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
- # average total pods in a namespace
+ aggregation_functions:
+ # Average GPU Memory Usage Percentage per container in a deployment
- function: avg
- query: 'avg_over_time(sum by(namespace) ((kube_pod_info{namespace="$NAMESPACE$"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ query: 'avg by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (avg_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
-
- # Show total running pods in a namespace
- - name: namespaceRunningPods
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # maximum total pods in a namespace
- - function: max
- query: 'max_over_time(sum by(namespace) ((kube_pod_status_phase{phase="Running"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
- # average total pods in a namespace
- - function: avg
- query: 'avg_over_time(sum by(namespace) ((kube_pod_status_phase{phase="Running"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
-
- # Show last activity for a namespace
- - name: namespaceMaxDate
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
+ # Maximum GPU Memory Usage Percentage per container in a deployment
- function: max
- query: 'max(last_over_time(timestamp((sum by (namespace) (container_cpu_usage_seconds_total{namespace="$NAMESPACE$"})) > 0 )[15d:]))'
+ query: 'max by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (max_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
+
+ # Minimum of GPU Memory Usage Percentage for a container in a deployment
+ - function: min
+ query: 'min by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (min_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
\ No newline at end of file
From 2d2858f4e2d0fe37228f6ddb161ecc4fc748f7ad Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Thu, 3 Oct 2024 15:01:11 +0530
Subject: [PATCH 05/64] Add queries to non recording rules yaml and in JSON
files
Signed-off-by: bharathappali
---
...esource_optimization_local_monitoring.json | 40 +++
...ion_local_monitoring_norecordingrules.json | 40 +++
...ion_local_monitoring_norecordingrules.yaml | 325 ++++++++++--------
3 files changed, 263 insertions(+), 142 deletions(-)
diff --git a/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.json b/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.json
index add7fd4ca..d2e243127 100644
--- a/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.json
+++ b/manifests/autotune/performance-profiles/resource_optimization_local_monitoring.json
@@ -412,6 +412,46 @@
"query": "max(last_over_time(timestamp((sum by (namespace) (container_cpu_usage_seconds_total{namespace=\"$NAMESPACE$\"})) > 0 )[15d:]))"
}
]
+ },
+ {
+ "name": "gpuCoreUsage",
+ "datasource": "prometheus",
+ "value_type": "double",
+ "kubernetes_object": "container",
+ "aggregation_functions": [
+ {
+ "function": "avg",
+ "query": "avg by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ },
+ {
+ "function": "max",
+ "query": "max by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ },
+ {
+ "function": "min",
+ "query": "min by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (min_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ }
+ ]
+ },
+ {
+ "name": "gpuMemoryUsage",
+ "datasource": "prometheus",
+ "value_type": "double",
+ "kubernetes_object": "container",
+ "aggregation_functions": [
+ {
+ "function": "avg",
+ "query": "avg by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (avg_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ },
+ {
+ "function": "max",
+ "query": "max by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (max_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ },
+ {
+ "function": "min",
+ "query": "min by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (min_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ }
+ ]
}
]
}
diff --git a/manifests/autotune/performance-profiles/resource_optimization_local_monitoring_norecordingrules.json b/manifests/autotune/performance-profiles/resource_optimization_local_monitoring_norecordingrules.json
index eeef1a07e..4f4d261ae 100644
--- a/manifests/autotune/performance-profiles/resource_optimization_local_monitoring_norecordingrules.json
+++ b/manifests/autotune/performance-profiles/resource_optimization_local_monitoring_norecordingrules.json
@@ -389,6 +389,46 @@
"query": "max(last_over_time(timestamp((sum by (namespace) (container_cpu_usage_seconds_total{namespace=\"$NAMESPACE$\"})) > 0 )[15d:]))"
}
]
+ },
+ {
+ "name": "gpuCoreUsage",
+ "datasource": "prometheus",
+ "value_type": "double",
+ "kubernetes_object": "container",
+ "aggregation_functions": [
+ {
+ "function": "avg",
+ "query": "avg by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ },
+ {
+ "function": "max",
+ "query": "max by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ },
+ {
+ "function": "min",
+ "query": "min by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (min_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ }
+ ]
+ },
+ {
+ "name": "gpuMemoryUsage",
+ "datasource": "prometheus",
+ "value_type": "double",
+ "kubernetes_object": "container",
+ "aggregation_functions": [
+ {
+ "function": "avg",
+ "query": "avg by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (avg_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ },
+ {
+ "function": "max",
+ "query": "max by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (max_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ },
+ {
+ "function": "min",
+ "query": "min by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (min_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace=\"$NAMESPACE$\",exported_container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))"
+ }
+ ]
}
]
}
diff --git a/manifests/autotune/performance-profiles/resource_optimization_local_monitoring_norecordingrules.yaml b/manifests/autotune/performance-profiles/resource_optimization_local_monitoring_norecordingrules.yaml
index 8a85c70e7..d50d42df1 100644
--- a/manifests/autotune/performance-profiles/resource_optimization_local_monitoring_norecordingrules.yaml
+++ b/manifests/autotune/performance-profiles/resource_optimization_local_monitoring_norecordingrules.yaml
@@ -210,168 +210,209 @@ slo:
- function: max
query: 'max by(namespace,container) (last_over_time((timestamp(container_cpu_usage_seconds_total{namespace="$NAMESPACE$", container="$CONTAINER_NAME$"} > 0))[15d:]))'
- ## namespace related queries
-
- # Namespace quota for CPU requests
- # Show namespace quota for CPU requests in cores for a namespace
- - name: namespaceCpuRequest
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # sum of all cpu request quotas for a namespace in cores
- - function: sum
- query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="requests.cpu", type="hard"})'
-
- # Namespace quota for CPU limits
- # Show namespace quota for CPU limits in cores for a namespace
- - name: namespaceCpuLimit
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # sum of all cpu limits quotas for a namespace in cores
- - function: sum
- query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="limits.cpu", type="hard"})'
-
-
- # Namespace quota for memory requests
- # Show namespace quota for memory requests in bytes for a namespace
- - name: namespaceMemoryRequest
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # sum of all memory requests quotas for a namespace in bytes
- - function: sum
- query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="requests.memory", type="hard"})'
-
-
- # Namespace quota for memory limits
- # Show namespace quota for memory limits in bytes for a namespace
- - name: namespaceMemoryLimit
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # sum of all memory limits quotas for a namespace in bytes
- - function: sum
- query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="limits.memory", type="hard"})'
-
-
- # Namespace CPU usage
- # Show cpu usages in cores for a namespace
- - name: namespaceCpuUsage
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # average cpu usages in cores for a namespace
- - function: avg
- query: 'avg_over_time(sum by(namespace) (rate(container_cpu_usage_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]) )[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ ## namespace related queries
- # maximum cpu usages in cores for a namespace
- - function: max
- query: 'max_over_time(sum by(namespace) (rate(container_cpu_usage_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]) )[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace quota for CPU requests
+ # Show namespace quota for CPU requests in cores for a namespace
+ - name: namespaceCpuRequest
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # sum of all cpu request quotas for a namespace in cores
+ - function: sum
+ query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="requests.cpu", type="hard"})'
- # minimum cpu usages in cores for a namespace
- - function: min
- query: 'min_over_time(sum by(namespace) (rate(container_cpu_usage_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]) )[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace quota for CPU limits
+ # Show namespace quota for CPU limits in cores for a namespace
+ - name: namespaceCpuLimit
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # sum of all cpu limits quotas for a namespace in cores
+ - function: sum
+ query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="limits.cpu", type="hard"})'
- # Namespace CPU Throttle
- # Show cpu throttle in cores for a namespace
- - name: namespaceCpuThrottle
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # average cpu throttle in cores for a namespace
- - function: avg
- query: 'avg_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace quota for memory requests
+ # Show namespace quota for memory requests in bytes for a namespace
+ - name: namespaceMemoryRequest
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # sum of all memory requests quotas for a namespace in bytes
+ - function: sum
+ query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="requests.memory", type="hard"})'
- # maximum cpu throttle in cores for a namespace
- - function: max
- query: 'max_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
- # minimum cpu throttle in cores for a namespace
- - function: min
- query: 'min_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace quota for memory limits
+ # Show namespace quota for memory limits in bytes for a namespace
+ - name: namespaceMemoryLimit
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # sum of all memory limits quotas for a namespace in bytes
+ - function: sum
+ query: 'sum by (namespace) (kube_resourcequota{namespace="$NAMESPACE$", resource="limits.memory", type="hard"})'
- # Namespace memory usage
- # Show memory usages in bytes for a namespace
- - name: namespaceMemoryUsage
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # average memory usage in bytes for a namespace
- - function: avg
- query: 'avg_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # Namespace CPU usage
+ # Show cpu usages in cores for a namespace
+ - name: namespaceCpuUsage
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # average cpu usages in cores for a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) (rate(container_cpu_usage_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]) )[$MEASUREMENT_DURATION_IN_MIN$m:])'
- # maximum memory usage in bytes for a namespace
- - function: max
- query: 'max_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # maximum cpu usages in cores for a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) (rate(container_cpu_usage_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]) )[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # minimum cpu usages in cores for a namespace
+ - function: min
+ query: 'min_over_time(sum by(namespace) (rate(container_cpu_usage_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]) )[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+
+ # Namespace CPU Throttle
+ # Show cpu throttle in cores for a namespace
+ - name: namespaceCpuThrottle
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # average cpu throttle in cores for a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # maximum cpu throttle in cores for a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # minimum cpu throttle in cores for a namespace
+ - function: min
+ query: 'min_over_time(sum by(namespace) (rate(container_cpu_cfs_throttled_seconds_total{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""}[5m]))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+
+ # Namespace memory usage
+ # Show memory usages in bytes for a namespace
+ - name: namespaceMemoryUsage
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # average memory usage in bytes for a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # maximum memory usage in bytes for a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # minimum memory usage in bytes for a namespace
+ - function: min
+ query: 'min_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+
+ # Namespace memory rss value
+ # Show memory rss in bytes for a namespace
+ - name: namespaceMemoryRSS
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # average memory rss in bytes for a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # maximum memory rss in bytes for a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # minimum memory rss in bytes for a namespace
+ - function: min
+ query: 'min_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+
+ # Show total pods in a namespace
+ - name: namespaceTotalPods
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # maximum total pods in a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) ((kube_pod_info{namespace="$NAMESPACE$"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # average total pods in a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) ((kube_pod_info{namespace="$NAMESPACE$"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+
+ # Show total running pods in a namespace
+ - name: namespaceRunningPods
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ # maximum total pods in a namespace
+ - function: max
+ query: 'max_over_time(sum by(namespace) ((kube_pod_status_phase{phase="Running"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # average total pods in a namespace
+ - function: avg
+ query: 'avg_over_time(sum by(namespace) ((kube_pod_status_phase{phase="Running"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
+
+ # Show last activity for a namespace
+ - name: namespaceMaxDate
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "namespace"
+ aggregation_functions:
+ - function: max
+ query: 'max(last_over_time(timestamp((sum by (namespace) (container_cpu_usage_seconds_total{namespace="$NAMESPACE$"})) > 0 )[15d:]))'
- # minimum memory usage in bytes for a namespace
- - function: min
- query: 'min_over_time(sum by(namespace) (container_memory_working_set_bytes{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ # GPU Related metrics
+
+ # GPU Core Usage
+ - name: gpuCoreUsage
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "container"
- # Namespace memory rss value
- # Show memory rss in bytes for a namespace
- - name: namespaceMemoryRSS
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # average memory rss in bytes for a namespace
+ aggregation_functions:
+ # Average GPU Core Usage Percentage per container in a deployment
- function: avg
- query: 'avg_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ query: 'avg by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
- # maximum memory rss in bytes for a namespace
+ # Maximum GPU Core Usage Percentage per container in a deployment
- function: max
- query: 'max_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ query: 'max by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
- # minimum memory rss in bytes for a namespace
+ # Minimum of GPU Core Usage Percentage for a container in a deployment
- function: min
- query: 'min_over_time(sum by(namespace) (container_memory_rss{namespace="$NAMESPACE$", container!="", container!="POD", pod!=""})[$MEASUREMENT_DURATION_IN_MIN$m:])'
+ query: 'min by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (min_over_time(DCGM_FI_DEV_GPU_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
+ # GPU Memory usage
+ - name: gpuMemoryUsage
+ datasource: prometheus
+ value_type: "double"
+ kubernetes_object: "container"
- # Show total pods in a namespace
- - name: namespaceTotalPods
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # maximum total pods in a namespace
- - function: max
- query: 'max_over_time(sum by(namespace) ((kube_pod_info{namespace="$NAMESPACE$"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
- # average total pods in a namespace
+ aggregation_functions:
+ # Average GPU Memory Usage Percentage per container in a deployment
- function: avg
- query: 'avg_over_time(sum by(namespace) ((kube_pod_info{namespace="$NAMESPACE$"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
-
+ query: 'avg by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (avg_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
- # Show total running pods in a namespace
- - name: namespaceRunningPods
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
- # maximum total pods in a namespace
- - function: max
- query: 'max_over_time(sum by(namespace) ((kube_pod_status_phase{phase="Running"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
- # average total pods in a namespace
- - function: avg
- query: 'avg_over_time(sum by(namespace) ((kube_pod_status_phase{phase="Running"}))[$MEASUREMENT_DURATION_IN_MIN$m:])'
-
- # Show last activity for a namespace
- - name: namespaceMaxDate
- datasource: prometheus
- value_type: "double"
- kubernetes_object: "namespace"
- aggregation_functions:
+ # Maximum GPU Memory Usage Percentage per container in a deployment
- function: max
- query: 'max(last_over_time(timestamp((sum by (namespace) (container_cpu_usage_seconds_total{namespace="$NAMESPACE$"})) > 0 )[15d:]))'
+ query: 'max by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (max_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
+
+ # Minimum of GPU Memory Usage Percentage for a container in a deployment
+ - function: min
+ query: 'min by (Hostname,device,modelName,UUID,exported_container,exported_namespace) (min_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{exported_namespace="$NAMESPACE$",exported_container="$CONTAINER_NAME$"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
From fb92dbb8941f62ed6e53d60fa928104536eee681 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Fri, 4 Oct 2024 13:58:56 +0530
Subject: [PATCH 06/64] bulk documentation
Signed-off-by: msvinaykumar
---
design/BulkAPI.md | 156 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 156 insertions(+)
create mode 100644 design/BulkAPI.md
diff --git a/design/BulkAPI.md b/design/BulkAPI.md
new file mode 100644
index 000000000..095149ce9
--- /dev/null
+++ b/design/BulkAPI.md
@@ -0,0 +1,156 @@
+# Bulk API Documentation
+
+Bulk is an API designed to provide resource optimization recommendations in bulk for all available
+containers, namespaces, etc., for a cluster connected via the datasource integration framework. Bulk can
+be configured using filters like exclude/include namespaces, workloads, containers, or labels for generating
+recommendations. It also has settings to generate recommendations at both the container or namespace level, or both.
+
+Bulk returns a `jobID` as a response to track the job status. The user can use the `jobID` to monitor the
+progress of the job.
+
+## Task Flow When Bulk Is Invoked
+
+1. Returns a unique `jobID`.
+2. Background Bulk:
+ - First, does a handshake with the datasource.
+ - Using queries, it fetches the list of namespaces, workloads, containers of the connected datasource.
+ - Creates experiments, one for each container *alpha release.
+ - Triggers `generateRecommendations` for each container.
+ - Once all experiments are created, and recommendations are generated, the system marks the `jobID` as "COMPLETED".
+
+## API Specification
+
+### POST /bulk
+
+**Request Payload (JSON):**
+
+```json
+{
+ "filter": {
+ "exclude": {
+ "namespace": [],
+ "workload": [],
+ "containers": [],
+ "labels": {}
+ },
+ "include": {
+ "namespace": [],
+ "workload": [],
+ "containers": [],
+ "labels": {
+ "key1": "value1",
+ "key2": "value2"
+ }
+ }
+ },
+ "time_range": {},
+ "datasource": "Cbank1Xyz",
+ "experiment_types": [
+ "container",
+ "namespace"
+ ]
+}
+```
+
+**filter:** This object contains both exclusion and inclusion filters to specify the scope of data being queried.
+
+- **exclude:** Defines the criteria to exclude certain data.
+ - **namespace:** A list of Kubernetes namespaces to exclude. If empty, no namespaces are excluded.
+ - **workload:** A list of workloads to exclude.
+ - **containers:** A list of container names to exclude.
+ - **labels:** Key-value pairs of labels to exclude.
+
+- **include:** Defines the criteria to include specific data.
+ - **namespace:** A list of Kubernetes namespaces to include.
+ - **workload:** A list of workloads to include.
+ - **containers:** A list of container names to include.
+ - **labels:** Key-value pairs of labels to include.
+
+- **time_range:** Specifies the time range for querying the data. If empty, no specific time range is applied.
+
+- **datasource:** The data source, e.g., `"Cbank1Xyz"`.
+
+- **experiment_types:** Specifies the type(s) of experiments to run, e.g., `"container"` or `"namespace"`.
+
+### Success Response
+
+- **Status:** 200 OK
+- **Body:**
+
+```json
+{
+ "jobid": "123e4567-e89b-12d3-a456-426614174000"
+}
+```
+
+### GET Request:
+
+```bash
+GET /bulk?jobid=123e4567-e89b-12d3-a456-426614174000
+```
+
+**Body (JSON):**
+
+```json
+{
+ "jobID": "123e4567-e89b-12d3-a456-426614174000",
+ "status": "IN-PROGRESS",
+ "progress": 30,
+ "data": {
+ "experiments": {
+ "new": [
+ "a",
+ "b",
+ "c"
+ ],
+ "updated": [],
+ "failed": []
+ },
+ "recommendations": {
+ "count": 9,
+ "completed": 3,
+ "experiments": {
+ "completed": [
+ "exp1",
+ "exp2",
+ "exp3"
+ ],
+ "progress": [
+ "exp1",
+ "exp2",
+ "exp3"
+ ],
+ "new": [
+ "exp1",
+ "exp2",
+ "exp3"
+ ],
+ "failed": []
+ }
+ }
+ },
+ "job_start_time": "2024-09-23T10:58:47.048Z",
+ "job_end_time": "2024-09-23T11:01:52.205Z"
+}
+```
+
+### Response Parameters
+
+- **jobID:** Unique identifier for the job.
+- **status:** Current status of the job. Possible values: `"IN-PROGRESS"`, `"COMPLETED"`, `"FAILED"`.
+- **progress:** Percentage of job completion.
+- **data:** Contains detailed information about the experiments and recommendations.
+ - **experiments:** Tracks the status of experiments.
+ - **new:** List of newly created experiments.
+ - **updated:** List of updated experiments.
+ - **failed:** List of experiments that failed.
+ - **recommendations:** Provides details on recommendations.
+ - **count:** Total number of recommendations.
+ - **completed:** Number of completed recommendations.
+ - **experiments:**
+ - **completed:** List of experiments with completed recommendations.
+ - **progress:** List of experiments in progress.
+ - **new:** List of new experiments.
+ - **failed:** List of failed experiments.
+- **job_start_time:** Timestamp indicating when the job started.
+- **job_end_time:** Timestamp indicating when the job finished.
From 424f678f76b52b9f58306aa7b2d899cd5cb723ca Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Thu, 3 Oct 2024 12:37:07 +0530
Subject: [PATCH 07/64] Add Device level interfaces
Signed-off-by: bharathappali
---
.../com/autotune/analyzer/utils/AnalyzerConstants.java | 7 +++++++
.../common/data/system/info/device/DeviceDetails.java | 7 +++++++
.../common/data/system/info/device/DeviceHandler.java | 9 +++++++++
3 files changed, 23 insertions(+)
create mode 100644 src/main/java/com/autotune/common/data/system/info/device/DeviceDetails.java
create mode 100644 src/main/java/com/autotune/common/data/system/info/device/DeviceHandler.java
diff --git a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
index 4d6b1460a..dcbced063 100644
--- a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
+++ b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
@@ -196,6 +196,13 @@ public enum RegisterRecommendationModelStatus {
INVALID
}
+ public enum DeviceType {
+ CPU,
+ MEMORY,
+ NETWORK,
+ GPU
+ }
+
public static final class ExperimentTypes {
public static final String NAMESPACE_EXPERIMENT = "namespace";
public static final String CONTAINER_EXPERIMENT = "container";
diff --git a/src/main/java/com/autotune/common/data/system/info/device/DeviceDetails.java b/src/main/java/com/autotune/common/data/system/info/device/DeviceDetails.java
new file mode 100644
index 000000000..584891b60
--- /dev/null
+++ b/src/main/java/com/autotune/common/data/system/info/device/DeviceDetails.java
@@ -0,0 +1,7 @@
+package com.autotune.common.data.system.info.device;
+
+import com.autotune.analyzer.utils.AnalyzerConstants;
+
+public interface DeviceDetails {
+ public AnalyzerConstants.DeviceType getType();
+}
diff --git a/src/main/java/com/autotune/common/data/system/info/device/DeviceHandler.java b/src/main/java/com/autotune/common/data/system/info/device/DeviceHandler.java
new file mode 100644
index 000000000..55e6df002
--- /dev/null
+++ b/src/main/java/com/autotune/common/data/system/info/device/DeviceHandler.java
@@ -0,0 +1,9 @@
+package com.autotune.common.data.system.info.device;
+
+import com.autotune.analyzer.utils.AnalyzerConstants;
+
+public interface DeviceHandler {
+ public void addDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo);
+ public void removeDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo);
+ public void updateDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo);
+}
From 08d9a805194dcbcd5794a3b6eb9bc9fdad7af99d Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Thu, 3 Oct 2024 12:57:08 +0530
Subject: [PATCH 08/64] Add Accelerator specific interfaces and their
respective implementations
Signed-off-by: bharathappali
---
.../analyzer/utils/AnalyzerConstants.java | 17 ++-
.../info/device/ContainerDeviceList.java | 130 ++++++++++++++++++
.../info/device/DeviceComponentDetector.java | 8 ++
.../system/info/device/DeviceHandler.java | 6 +
.../accelerator/AcceleratorDeviceData.java | 59 ++++++++
.../accelerator/AcceleratorDeviceDetails.java | 11 ++
.../metadata/AcceleratorMetaDataService.java | 78 +++++++++++
.../metadata/AcceleratorProfile.java | 41 ++++++
8 files changed, 349 insertions(+), 1 deletion(-)
create mode 100644 src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java
create mode 100644 src/main/java/com/autotune/common/data/system/info/device/DeviceComponentDetector.java
create mode 100644 src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceData.java
create mode 100644 src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceDetails.java
create mode 100644 src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java
create mode 100644 src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorProfile.java
diff --git a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
index dcbced063..d4eb14646 100644
--- a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
+++ b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
@@ -200,7 +200,22 @@ public enum DeviceType {
CPU,
MEMORY,
NETWORK,
- GPU
+ ACCELERATOR
+ }
+
+ public enum DeviceParameters {
+ MODEL_NAME,
+ UUID,
+ HOSTNAME,
+ NAME,
+ MANUFACTURER,
+ DEVICE_NAME
+ }
+
+ public static final class SupportedGPUs {
+ public static final String A100_80_GB = "A100-80GB";
+ public static final String A100_40_GB = "A100-40GB";
+ public static final String H100 = "H100";
}
public static final class ExperimentTypes {
diff --git a/src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java b/src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java
new file mode 100644
index 000000000..e2c484549
--- /dev/null
+++ b/src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java
@@ -0,0 +1,130 @@
+package com.autotune.common.data.system.info.device;
+
+import com.autotune.analyzer.utils.AnalyzerConstants;
+import com.autotune.common.data.system.info.device.accelerator.AcceleratorDeviceData;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+public class ContainerDeviceList implements DeviceHandler, DeviceComponentDetector {
+ private final HashMap<AnalyzerConstants.DeviceType, ArrayList<DeviceDetails>> deviceMap;
+ private boolean isAcceleratorDeviceDetected;
+ private boolean isCPUDeviceDetected;
+ private boolean isMemoryDeviceDetected;
+ private boolean isNetworkDeviceDetected;
+
+ public ContainerDeviceList(){
+ this.deviceMap = new HashMap<AnalyzerConstants.DeviceType, ArrayList<DeviceDetails>>();
+ this.isAcceleratorDeviceDetected = false;
+ // Currently setting up CPU, Memory and Network as true by default
+ this.isCPUDeviceDetected = true;
+ this.isMemoryDeviceDetected = true;
+ this.isNetworkDeviceDetected = true;
+ }
+
+ @Override
+ public void addDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo) {
+ if (null == deviceType || null == deviceInfo) {
+ // TODO: Handle appropriate returns in future
+ return;
+ }
+
+ if (deviceType == AnalyzerConstants.DeviceType.ACCELERATOR)
+ this.isAcceleratorDeviceDetected = true;
+
+ // TODO: Handle multiple same entries
+ // Currently only first MIG is getting added so no check for existing duplicates is done
+ if (null == deviceMap.get(deviceType)) {
+ ArrayList<DeviceDetails> deviceDetailsList = new ArrayList<DeviceDetails>();
+ deviceDetailsList.add(deviceInfo);
+ this.deviceMap.put(deviceType, deviceDetailsList);
+ } else {
+ this.deviceMap.get(deviceType).add(deviceInfo);
+ }
+ }
+
+ @Override
+ public void removeDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo) {
+ if (null == deviceType || null == deviceInfo) {
+ // TODO: Handle appropriate returns in future
+ return;
+ }
+ // TODO: Need to be implemented if we need a dynamic experiment device updates
+ if (deviceType == AnalyzerConstants.DeviceType.ACCELERATOR) {
+ if (null == deviceMap.get(deviceType) || this.deviceMap.get(deviceType).isEmpty()) {
+ this.isAcceleratorDeviceDetected = false;
+ }
+ }
+ }
+
+ @Override
+ public void updateDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo) {
+ // TODO: Need to be implemented if we need a dynamic experiment device updates
+ }
+
+ @Override
+ public DeviceDetails getDeviceByParameter(AnalyzerConstants.DeviceType deviceType, String matchIdentifier, AnalyzerConstants.DeviceParameters deviceParameters) {
+ if (null == deviceType)
+ return null;
+ if (null == matchIdentifier)
+ return null;
+ if (null == deviceParameters)
+ return null;
+ if (matchIdentifier.isEmpty())
+ return null;
+ if (!deviceMap.containsKey(deviceType))
+ return null;
+ if (null == deviceMap.get(deviceType))
+ return null;
+ if (deviceMap.get(deviceType).isEmpty())
+ return null;
+
+ // Todo: Need to add extractors for each device type currently implementing for GPU
+ if (deviceType == AnalyzerConstants.DeviceType.ACCELERATOR) {
+ for (DeviceDetails deviceDetails: deviceMap.get(deviceType)) {
+ AcceleratorDeviceData deviceData = (AcceleratorDeviceData) deviceDetails;
+ if (deviceParameters == AnalyzerConstants.DeviceParameters.MODEL_NAME) {
+ if (deviceData.getModelName().equalsIgnoreCase(matchIdentifier)) {
+ return deviceData;
+ }
+ }
+ }
+ }
+
+ return null;
+ }
+
+ @Override
+ public ArrayList<DeviceDetails> getDevices(AnalyzerConstants.DeviceType deviceType) {
+ if (null == deviceType)
+ return null;
+ if (!deviceMap.containsKey(deviceType))
+ return null;
+ if (null == deviceMap.get(deviceType))
+ return null;
+ if (deviceMap.get(deviceType).isEmpty())
+ return null;
+
+ return deviceMap.get(deviceType);
+ }
+
+ @Override
+ public boolean isAcceleratorDeviceDetected() {
+ return this.isAcceleratorDeviceDetected;
+ }
+
+ @Override
+ public boolean isCPUDeviceDetected() {
+ return this.isCPUDeviceDetected;
+ }
+
+ @Override
+ public boolean isMemoryDeviceDetected() {
+ return this.isMemoryDeviceDetected;
+ }
+
+ @Override
+ public boolean isNetworkDeviceDetected() {
+ return this.isNetworkDeviceDetected;
+ }
+}
diff --git a/src/main/java/com/autotune/common/data/system/info/device/DeviceComponentDetector.java b/src/main/java/com/autotune/common/data/system/info/device/DeviceComponentDetector.java
new file mode 100644
index 000000000..249ba9c55
--- /dev/null
+++ b/src/main/java/com/autotune/common/data/system/info/device/DeviceComponentDetector.java
@@ -0,0 +1,8 @@
+package com.autotune.common.data.system.info.device;
+
+public interface DeviceComponentDetector {
+ public boolean isAcceleratorDeviceDetected();
+ public boolean isCPUDeviceDetected();
+ public boolean isMemoryDeviceDetected();
+ public boolean isNetworkDeviceDetected();
+}
diff --git a/src/main/java/com/autotune/common/data/system/info/device/DeviceHandler.java b/src/main/java/com/autotune/common/data/system/info/device/DeviceHandler.java
index 55e6df002..447716440 100644
--- a/src/main/java/com/autotune/common/data/system/info/device/DeviceHandler.java
+++ b/src/main/java/com/autotune/common/data/system/info/device/DeviceHandler.java
@@ -2,8 +2,14 @@
import com.autotune.analyzer.utils.AnalyzerConstants;
+import java.util.ArrayList;
+
public interface DeviceHandler {
public void addDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo);
public void removeDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo);
public void updateDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo);
+ public DeviceDetails getDeviceByParameter(AnalyzerConstants.DeviceType deviceType,
+ String matchIdentifier,
+ AnalyzerConstants.DeviceParameters deviceParameters);
+ public ArrayList<DeviceDetails> getDevices(AnalyzerConstants.DeviceType deviceType);
}
diff --git a/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceData.java b/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceData.java
new file mode 100644
index 000000000..a3a09fead
--- /dev/null
+++ b/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceData.java
@@ -0,0 +1,59 @@
+package com.autotune.common.data.system.info.device.accelerator;
+
+import com.autotune.analyzer.utils.AnalyzerConstants;
+
+public class AcceleratorDeviceData implements AcceleratorDeviceDetails {
+ private final String manufacturer;
+ private final String modelName;
+ private final String hostName;
+ private final String UUID;
+ private final String deviceName;
+ private boolean isMIG;
+
+ public AcceleratorDeviceData (String modelName, String hostName, String UUID, String deviceName, boolean isMIG) {
+ this.manufacturer = "NVIDIA";
+ this.modelName = modelName;
+ this.hostName = hostName;
+ this.UUID = UUID;
+ this.deviceName = deviceName;
+ this.isMIG = isMIG;
+ }
+
+ @Override
+ public String getManufacturer() {
+ return this.manufacturer;
+ }
+
+ @Override
+ public String getModelName() {
+ return modelName;
+ }
+
+ @Override
+ public String getHostName() {
+ return hostName;
+ }
+
+ @Override
+ public String getUUID() {
+ return UUID;
+ }
+
+ @Override
+ public String getDeviceName() {
+ return deviceName;
+ }
+
+ public boolean isMIG() {
+ return isMIG;
+ }
+
+ public void setMIG(boolean isMIG) {
+ this.isMIG = isMIG;
+ }
+
+ @Override
+ public AnalyzerConstants.DeviceType getType() {
+ return AnalyzerConstants.DeviceType.ACCELERATOR;
+ }
+}
diff --git a/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceDetails.java b/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceDetails.java
new file mode 100644
index 000000000..31b90ff66
--- /dev/null
+++ b/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceDetails.java
@@ -0,0 +1,11 @@
+package com.autotune.common.data.system.info.device.accelerator;
+
+import com.autotune.common.data.system.info.device.DeviceDetails;
+
+public interface AcceleratorDeviceDetails extends DeviceDetails {
+ public String getManufacturer();
+ public String getModelName();
+ public String getHostName();
+ public String getUUID();
+ public String getDeviceName();
+}
diff --git a/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java b/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java
new file mode 100644
index 000000000..a225c0757
--- /dev/null
+++ b/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java
@@ -0,0 +1,78 @@
+package com.autotune.common.data.system.info.device.accelerator.metadata;
+
+
+
+import com.autotune.analyzer.utils.AnalyzerConstants;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class AcceleratorMetaDataService {
+ private static Map<String, List<AcceleratorProfile>> acceleratorProfilesMap;
+ private static AcceleratorMetaDataService acceleratorMetaDataService = null;
+
+ private AcceleratorMetaDataService(){
+ acceleratorProfilesMap = new HashMap<>();
+ initializeAcceleratorProfiles();
+ }
+
+ private static void initializeAcceleratorProfiles() {
+ List<AcceleratorProfile> commonProfiles = new ArrayList<>();
+ // IMPORTANT: Add it in the ascending order according to GPU Core and Memory Units as we will break the loop upon getting the right one
+ commonProfiles.add(new AcceleratorProfile("1g.10gb", 1.0 / 8, 1.0 / 7, 7));
+ commonProfiles.add(new AcceleratorProfile("1g.20gb", 1.0 / 4, 1.0 / 7, 4));
+ commonProfiles.add(new AcceleratorProfile("2g.20gb", 2.0 / 8, 2.0 / 7, 3));
+ commonProfiles.add(new AcceleratorProfile("3g.40gb", 4.0 / 8, 3.0 / 7, 2));
+ commonProfiles.add(new AcceleratorProfile("4g.40gb", 4.0 / 8, 4.0 / 7, 1));
+ commonProfiles.add(new AcceleratorProfile("7g.80gb", 1.0, 1.0, 1));
+
+ List<AcceleratorProfile> a100_40_gb_profiles = new ArrayList<>();
+ // IMPORTANT: Add it in the ascending order according to GPU Core and Memory Units as we will break the loop upon getting the right one
+ a100_40_gb_profiles.add(new AcceleratorProfile("1g.5gb", 1.0 / 8, 1.0 / 7, 7));
+ a100_40_gb_profiles.add(new AcceleratorProfile("1g.10gb", 1.0 / 4, 1.0 / 7, 4));
+ a100_40_gb_profiles.add(new AcceleratorProfile("2g.10gb", 2.0 / 8, 2.0 / 7, 3));
+ a100_40_gb_profiles.add(new AcceleratorProfile("3g.20gb", 4.0 / 8, 3.0 / 7, 2));
+ a100_40_gb_profiles.add(new AcceleratorProfile("4g.20gb", 4.0 / 8, 4.0 / 7, 1));
+ a100_40_gb_profiles.add(new AcceleratorProfile("7g.40gb", 1.0, 1.0, 1));
+
+ acceleratorProfilesMap.put(AnalyzerConstants.SupportedGPUs.A100_80_GB, new ArrayList<>(commonProfiles));
+ acceleratorProfilesMap.put(AnalyzerConstants.SupportedGPUs.H100, new ArrayList<>(commonProfiles));
+ acceleratorProfilesMap.put(AnalyzerConstants.SupportedGPUs.A100_40_GB, new ArrayList<>(a100_40_gb_profiles));
+ }
+
+ public static AcceleratorMetaDataService getInstance() {
+ if(null == acceleratorMetaDataService) {
+ synchronized (AcceleratorMetaDataService.class) {
+ if (null == acceleratorMetaDataService) {
+ acceleratorMetaDataService = new AcceleratorMetaDataService();
+ }
+ }
+ }
+ return acceleratorMetaDataService;
+ }
+
+ public AcceleratorProfile getAcceleratorProfile(String modelName, Double requiredSmFraction, Double requiredMemoryFraction) {
+ if (null == modelName || null == requiredSmFraction || null == requiredMemoryFraction) {
+ return null;
+ }
+ modelName = modelName.strip();
+ if (!modelName.equalsIgnoreCase(AnalyzerConstants.SupportedGPUs.A100_80_GB)
+ && !modelName.equalsIgnoreCase(AnalyzerConstants.SupportedGPUs.H100)
+ && !modelName.equalsIgnoreCase(AnalyzerConstants.SupportedGPUs.A100_40_GB)) {
+ return null;
+ }
+ if (requiredMemoryFraction < 0.0 || requiredSmFraction < 0.0) {
+ return null;
+ }
+ List<AcceleratorProfile> gpuProfiles = acceleratorProfilesMap.get(modelName);
+ for (AcceleratorProfile profile : gpuProfiles) {
+ if (profile.getMemoryFraction() >= requiredMemoryFraction && profile.getSmFraction() >= requiredSmFraction) {
+ // Returning the profile as the list is in ascending order
+ return profile;
+ }
+ }
+ return null;
+ }
+}
diff --git a/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorProfile.java b/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorProfile.java
new file mode 100644
index 000000000..024d78e63
--- /dev/null
+++ b/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorProfile.java
@@ -0,0 +1,41 @@
+package com.autotune.common.data.system.info.device.accelerator.metadata;
+
+public class AcceleratorProfile {
+ private final String profileName;
+ private final double memoryFraction;
+ private final double smFraction;
+ private final int instancesAvailable;
+
+ public AcceleratorProfile(String profileName, double memoryFraction, double smFraction, int instancesAvailable) {
+ this.profileName = profileName;
+ this.memoryFraction = memoryFraction;
+ this.smFraction = smFraction;
+ this.instancesAvailable = instancesAvailable;
+ }
+
+ public String getProfileName() {
+ return profileName;
+ }
+
+ public double getMemoryFraction() {
+ return memoryFraction;
+ }
+
+ public double getSmFraction() {
+ return smFraction;
+ }
+
+ public int getInstancesAvailable() {
+ return instancesAvailable;
+ }
+
+ @Override
+ public String toString() {
+ return "AcceleratorProfile{" +
+ "profileName='" + profileName + '\'' +
+ ", memoryFraction=" + memoryFraction +
+ ", smFraction=" + smFraction +
+ ", instancesAvailable=" + instancesAvailable +
+ '}';
+ }
+}
From 71d0d4185df32b74c3f9153e7ce4a3202ef27947 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Thu, 3 Oct 2024 16:29:46 +0530
Subject: [PATCH 09/64] Add Javadoc and create constants
Signed-off-by: bharathappali
---
.../analyzer/utils/AnalyzerConstants.java | 38 ++++++++++--
.../info/device/ContainerDeviceList.java | 14 +++++
.../metadata/AcceleratorMetaDataService.java | 61 +++++++++++++------
.../metadata/AcceleratorProfile.java | 12 +++-
4 files changed, 102 insertions(+), 23 deletions(-)
diff --git a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
index d4eb14646..13dd02511 100644
--- a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
+++ b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
@@ -212,10 +212,40 @@ public enum DeviceParameters {
DEVICE_NAME
}
- public static final class SupportedGPUs {
- public static final String A100_80_GB = "A100-80GB";
- public static final String A100_40_GB = "A100-40GB";
- public static final String H100 = "H100";
+ public static final class AcceleratorConstants {
+ private AcceleratorConstants() {
+
+ }
+
+ public static final class SupportedAccelerators {
+ private SupportedAccelerators() {
+
+ }
+ public static final String A100_80_GB = "A100-80GB";
+ public static final String A100_40_GB = "A100-40GB";
+ public static final String H100 = "H100";
+ }
+
+ public static final class AcceleratorProfiles {
+ private AcceleratorProfiles () {
+
+ }
+
+ // A100 40GB Profiles
+ public static final String PROFILE_1G_5GB = "1g.5gb";
+ public static final String PROFILE_1G_10GB = "1g.10gb";
+ public static final String PROFILE_2G_10GB = "2g.10gb";
+ public static final String PROFILE_3G_20GB = "3g.20gb";
+ public static final String PROFILE_4G_20GB = "4g.20gb";
+ public static final String PROFILE_7G_40GB = "7g.40gb";
+
+ // A100 80GB & H100 80GB Profiles
+ public static final String PROFILE_1G_20GB = "1g.20gb";
+ public static final String PROFILE_2G_20GB = "2g.20gb";
+ public static final String PROFILE_3G_40GB = "3g.40gb";
+ public static final String PROFILE_4G_40GB = "4g.40gb";
+ public static final String PROFILE_7G_80GB = "7g.80gb";
+ }
}
public static final class ExperimentTypes {
diff --git a/src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java b/src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java
index e2c484549..00de9e322 100644
--- a/src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java
+++ b/src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java
@@ -6,6 +6,9 @@
import java.util.ArrayList;
import java.util.HashMap;
+/**
+ * This class stores the device entries linked to the container
+ */
public class ContainerDeviceList implements DeviceHandler, DeviceComponentDetector {
private final HashMap<AnalyzerConstants.DeviceType, ArrayList<DeviceDetails>> deviceMap;
private boolean isAcceleratorDeviceDetected;
@@ -62,6 +65,17 @@ public void updateDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails
// TODO: Need to be implemented if we need a dynamic experiment device updates
}
+ /**
+ * Returns the Device which matches the identifier based on the device parameter passed
+ * @param deviceType - Type of the device Eg: CPU, Memory, Network or Accelerator
+ * @param matchIdentifier - String which needs to be matched
+ * @param deviceParameters - Parameter to search in device details list
+ * @return the appropriate DeviceDetails object
+ *
+ * USE CASE: To search for a device based on a particular parameter. Let's say you have multiple accelerators
+ * attached to the container; you can pass MODEL_NAME as the parameter and the name of the model to get the
+ * particular DeviceDetails object.
+ */
@Override
public DeviceDetails getDeviceByParameter(AnalyzerConstants.DeviceType deviceType, String matchIdentifier, AnalyzerConstants.DeviceParameters deviceParameters) {
if (null == deviceType)
diff --git a/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java b/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java
index a225c0757..58d43d686 100644
--- a/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java
+++ b/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java
@@ -9,10 +9,23 @@
import java.util.List;
import java.util.Map;
+/**
+ * A service which is created to provide the respective Accelerator Profile
+ * based on SM and Memory requirements
+ *
+ * This service initially loads the profiles of supported Accelerators
+ * Currently it supports:
+ * NVIDIA A100 40GB
+ * NVIDIA A100 80GB
+ * NVIDIA H100 80GB
+ */
public class AcceleratorMetaDataService {
private static Map<String, List<AcceleratorProfile>> acceleratorProfilesMap;
private static AcceleratorMetaDataService acceleratorMetaDataService = null;
+ /**
+ * Private constructor to prevent external instantiation; the profile map is initialised once for the singleton.
+ */
private AcceleratorMetaDataService(){
acceleratorProfilesMap = new HashMap<>();
initializeAcceleratorProfiles();
@@ -21,25 +34,37 @@ private AcceleratorMetaDataService(){
private static void initializeAcceleratorProfiles() {
List<AcceleratorProfile> commonProfiles = new ArrayList<>();
// IMPORTANT: Add it in the ascending order according to GPU Core and Memory Units as we will break the loop upon getting the right one
- commonProfiles.add(new AcceleratorProfile("1g.10gb", 1.0 / 8, 1.0 / 7, 7));
- commonProfiles.add(new AcceleratorProfile("1g.20gb", 1.0 / 4, 1.0 / 7, 4));
- commonProfiles.add(new AcceleratorProfile("2g.20gb", 2.0 / 8, 2.0 / 7, 3));
- commonProfiles.add(new AcceleratorProfile("3g.40gb", 4.0 / 8, 3.0 / 7, 2));
- commonProfiles.add(new AcceleratorProfile("4g.40gb", 4.0 / 8, 4.0 / 7, 1));
- commonProfiles.add(new AcceleratorProfile("7g.80gb", 1.0, 1.0, 1));
+ commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_10GB,
+ 1.0 / 8, 1.0 / 7, 7));
+ commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_20GB,
+ 1.0 / 4, 1.0 / 7, 4));
+ commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_2G_20GB,
+ 2.0 / 8, 2.0 / 7, 3));
+ commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_3G_40GB,
+ 4.0 / 8, 3.0 / 7, 2));
+ commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_4G_40GB,
+ 4.0 / 8, 4.0 / 7, 1));
+ commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_7G_80GB,
+ 1.0, 1.0, 1));
List<AcceleratorProfile> a100_40_gb_profiles = new ArrayList<>();
// IMPORTANT: Add it in the ascending order according to GPU Core and Memory Units as we will break the loop upon getting the right one
- a100_40_gb_profiles.add(new AcceleratorProfile("1g.5gb", 1.0 / 8, 1.0 / 7, 7));
- a100_40_gb_profiles.add(new AcceleratorProfile("1g.10gb", 1.0 / 4, 1.0 / 7, 4));
- a100_40_gb_profiles.add(new AcceleratorProfile("2g.10gb", 2.0 / 8, 2.0 / 7, 3));
- a100_40_gb_profiles.add(new AcceleratorProfile("3g.20gb", 4.0 / 8, 3.0 / 7, 2));
- a100_40_gb_profiles.add(new AcceleratorProfile("4g.20gb", 4.0 / 8, 4.0 / 7, 1));
- a100_40_gb_profiles.add(new AcceleratorProfile("7g.40gb", 1.0, 1.0, 1));
+ a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_5GB,
+ 1.0 / 8, 1.0 / 7, 7));
+ a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_10GB,
+ 1.0 / 4, 1.0 / 7, 4));
+ a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_2G_10GB,
+ 2.0 / 8, 2.0 / 7, 3));
+ a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_3G_20GB,
+ 4.0 / 8, 3.0 / 7, 2));
+ a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_4G_20GB,
+ 4.0 / 8, 4.0 / 7, 1));
+ a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_7G_40GB,
+ 1.0, 1.0, 1));
- acceleratorProfilesMap.put(AnalyzerConstants.SupportedGPUs.A100_80_GB, new ArrayList<>(commonProfiles));
- acceleratorProfilesMap.put(AnalyzerConstants.SupportedGPUs.H100, new ArrayList<>(commonProfiles));
- acceleratorProfilesMap.put(AnalyzerConstants.SupportedGPUs.A100_40_GB, new ArrayList<>(a100_40_gb_profiles));
+ acceleratorProfilesMap.put(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.A100_80_GB, new ArrayList<>(commonProfiles));
+ acceleratorProfilesMap.put(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.H100, new ArrayList<>(commonProfiles));
+ acceleratorProfilesMap.put(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.A100_40_GB, new ArrayList<>(a100_40_gb_profiles));
}
public static AcceleratorMetaDataService getInstance() {
@@ -58,9 +83,9 @@ public AcceleratorProfile getAcceleratorProfile(String modelName, Double require
return null;
}
modelName = modelName.strip();
- if (!modelName.equalsIgnoreCase(AnalyzerConstants.SupportedGPUs.A100_80_GB)
- && !modelName.equalsIgnoreCase(AnalyzerConstants.SupportedGPUs.H100)
- && !modelName.equalsIgnoreCase(AnalyzerConstants.SupportedGPUs.A100_40_GB)) {
+ if (!modelName.equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.A100_80_GB)
+ && !modelName.equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.H100)
+ && !modelName.equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.A100_40_GB)) {
return null;
}
if (requiredMemoryFraction < 0.0 || requiredSmFraction < 0.0) {
diff --git a/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorProfile.java b/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorProfile.java
index 024d78e63..c0db82b50 100644
--- a/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorProfile.java
+++ b/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorProfile.java
@@ -1,11 +1,21 @@
package com.autotune.common.data.system.info.device.accelerator.metadata;
+/**
+ * Class which is used to store the details of an accelerator profile
+ */
public class AcceleratorProfile {
private final String profileName;
private final double memoryFraction;
private final double smFraction;
private final int instancesAvailable;
+ /**
+ * Constructor to create the Accelerator Profile
+ * @param profileName - Name of the profile
+ * @param memoryFraction - Fraction of memory out of the whole accelerator memory
+ * @param smFraction - Fraction of Cores or Streaming Processors out of the whole accelerator cores
+ * @param instancesAvailable - Number of instances of a profile available on an Accelerator
+ */
public AcceleratorProfile(String profileName, double memoryFraction, double smFraction, int instancesAvailable) {
this.profileName = profileName;
this.memoryFraction = memoryFraction;
@@ -14,7 +24,7 @@ public AcceleratorProfile(String profileName, double memoryFraction, double smFr
}
public String getProfileName() {
- return profileName;
+ return this.profileName;
}
public double getMemoryFraction() {
From e699630c0ca92af185fdf2a7dcbb798830dae7e5 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Fri, 4 Oct 2024 01:21:44 +0530
Subject: [PATCH 10/64] Add accelerator related fields in existing structures
Signed-off-by: bharathappali
---
.../data/metrics/AcceleratorMetricResult.java | 29 +++++++++++++++++++
.../common/data/result/ContainerData.java | 10 +++++++
.../common/data/result/IntervalResults.java | 10 +++++++
3 files changed, 49 insertions(+)
create mode 100644 src/main/java/com/autotune/common/data/metrics/AcceleratorMetricResult.java
diff --git a/src/main/java/com/autotune/common/data/metrics/AcceleratorMetricResult.java b/src/main/java/com/autotune/common/data/metrics/AcceleratorMetricResult.java
new file mode 100644
index 000000000..01f570ecb
--- /dev/null
+++ b/src/main/java/com/autotune/common/data/metrics/AcceleratorMetricResult.java
@@ -0,0 +1,29 @@
+package com.autotune.common.data.metrics;
+
+import com.autotune.common.data.system.info.device.accelerator.AcceleratorDeviceData;
+
+public class AcceleratorMetricResult {
+ private AcceleratorDeviceData acceleratorDeviceData;
+ private MetricResults metricResults;
+
+ public AcceleratorMetricResult(AcceleratorDeviceData acceleratorDeviceData, MetricResults metricResults) {
+ this.acceleratorDeviceData = acceleratorDeviceData;
+ this.metricResults = metricResults;
+ }
+
+ public AcceleratorDeviceData getAcceleratorDeviceData() {
+ return acceleratorDeviceData;
+ }
+
+ public void setAcceleratorDeviceData(AcceleratorDeviceData acceleratorDeviceData) {
+ this.acceleratorDeviceData = acceleratorDeviceData;
+ }
+
+ public MetricResults getMetricResults() {
+ return metricResults;
+ }
+
+ public void setMetricResults(MetricResults metricResults) {
+ this.metricResults = metricResults;
+ }
+}
diff --git a/src/main/java/com/autotune/common/data/result/ContainerData.java b/src/main/java/com/autotune/common/data/result/ContainerData.java
index 4f7afcc7f..66aa1dfc5 100644
--- a/src/main/java/com/autotune/common/data/result/ContainerData.java
+++ b/src/main/java/com/autotune/common/data/result/ContainerData.java
@@ -18,6 +18,7 @@
import com.autotune.analyzer.recommendations.ContainerRecommendations;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.common.data.metrics.Metric;
+import com.autotune.common.data.system.info.device.ContainerDeviceList;
import com.autotune.utils.KruizeConstants;
import com.google.gson.annotations.SerializedName;
@@ -29,6 +30,7 @@ public class ContainerData {
private String container_name;
//key is intervalEndTime
private HashMap<Timestamp, IntervalResults> results;
+ private ContainerDeviceList containerDeviceList;
@SerializedName(KruizeConstants.JSONKeys.RECOMMENDATIONS)
private ContainerRecommendations containerRecommendations;
private HashMap<AnalyzerConstants.MetricName, Metric> metrics;
@@ -85,6 +87,14 @@ public HashMap getMetrics() {
public void setMetrics(HashMap<AnalyzerConstants.MetricName, Metric> metrics) {
this.metrics = metrics;
}
+
+ public ContainerDeviceList getContainerDeviceList() {
+ return containerDeviceList;
+ }
+
+ public void setContainerDeviceList(ContainerDeviceList containerDeviceList) {
+ this.containerDeviceList = containerDeviceList;
+ }
@Override
public String toString() {
return "ContainerData{" +
diff --git a/src/main/java/com/autotune/common/data/result/IntervalResults.java b/src/main/java/com/autotune/common/data/result/IntervalResults.java
index e9bd880f3..327681690 100644
--- a/src/main/java/com/autotune/common/data/result/IntervalResults.java
+++ b/src/main/java/com/autotune/common/data/result/IntervalResults.java
@@ -16,6 +16,7 @@
package com.autotune.common.data.result;
import com.autotune.analyzer.utils.AnalyzerConstants;
+import com.autotune.common.data.metrics.AcceleratorMetricResult;
import com.autotune.common.data.metrics.MetricResults;
import com.google.gson.annotations.SerializedName;
@@ -32,6 +33,7 @@
public class IntervalResults {
@SerializedName(METRICS)
HashMap<AnalyzerConstants.MetricName, MetricResults> metricResultsMap;
+ HashMap<AnalyzerConstants.MetricName, AcceleratorMetricResult> acceleratorMetricResultHashMap;
@SerializedName(INTERVAL_START_TIME)
private Timestamp intervalStartTime;
@SerializedName(INTERVAL_END_TIME)
@@ -85,6 +87,14 @@ public void setDurationInMinutes(Double durationInMinutes) {
this.durationInMinutes = durationInMinutes;
}
+ public HashMap<AnalyzerConstants.MetricName, AcceleratorMetricResult> getAcceleratorMetricResultHashMap() {
+ return acceleratorMetricResultHashMap;
+ }
+
+ public void setAcceleratorMetricResultHashMap(HashMap<AnalyzerConstants.MetricName, AcceleratorMetricResult> acceleratorMetricResultHashMap) {
+ this.acceleratorMetricResultHashMap = acceleratorMetricResultHashMap;
+ }
+
@Override
public String toString() {
return "IntervalResults{" +
From 802183cb1db9b1fb74bf79756b95aba851390054 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Fri, 4 Oct 2024 01:36:00 +0530
Subject: [PATCH 11/64] Make changes to RecommendationItem enum to accommodate
MIG strings
Signed-off-by: bharathappali
---
.../engine/RecommendationEngine.java | 123 +++++++++---------
.../utils/RecommendationUtils.java | 24 ++--
.../analyzer/utils/AnalyzerConstants.java | 27 +++-
3 files changed, 95 insertions(+), 79 deletions(-)
diff --git a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
index bb9a202be..dd9683fbb 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
@@ -17,7 +17,6 @@
import com.autotune.analyzer.recommendations.utils.RecommendationUtils;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.analyzer.utils.AnalyzerErrorConstants;
-import com.autotune.analyzer.utils.ExperimentTypeUtil;
import com.autotune.common.data.ValidationOutputData;
import com.autotune.common.data.metrics.AggregationFunctions;
import com.autotune.common.data.metrics.Metric;
@@ -27,8 +26,6 @@
import com.autotune.common.data.result.IntervalResults;
import com.autotune.common.data.result.NamespaceData;
import com.autotune.common.datasource.DataSourceInfo;
-import com.autotune.common.auth.AuthenticationStrategy;
-import com.autotune.common.auth.AuthenticationStrategyFactory;
import com.autotune.common.exceptions.DataSourceNotExist;
import com.autotune.common.k8sObjects.K8sObject;
import com.autotune.common.utils.CommonUtils;
@@ -435,12 +432,12 @@ RecommendationConfigItem>> getCurrentConfigData(ContainerData containerData, Tim
if (null == configItem)
continue;
if (null == configItem.getAmount()) {
- if (recommendationItem.equals(AnalyzerConstants.RecommendationItem.cpu)) {
+ if (recommendationItem.equals(AnalyzerConstants.RecommendationItem.CPU)) {
notifications.add(RecommendationConstants.RecommendationNotification.ERROR_AMOUNT_MISSING_IN_CPU_SECTION);
LOGGER.error(RecommendationConstants.RecommendationNotificationMsgConstant.AMOUNT_MISSING_IN_CPU_SECTION
.concat(String.format(AnalyzerErrorConstants.AutotuneObjectErrors.EXPERIMENT_AND_INTERVAL_END_TIME,
experimentName, interval_end_time)));
- } else if (recommendationItem.equals((AnalyzerConstants.RecommendationItem.memory))) {
+ } else if (recommendationItem.equals((AnalyzerConstants.RecommendationItem.MEMORY))) {
notifications.add(RecommendationConstants.RecommendationNotification.ERROR_AMOUNT_MISSING_IN_MEMORY_SECTION);
LOGGER.error(RecommendationConstants.RecommendationNotificationMsgConstant.AMOUNT_MISSING_IN_MEMORY_SECTION
.concat(String.format(AnalyzerErrorConstants.AutotuneObjectErrors.EXPERIMENT_AND_INTERVAL_END_TIME,
@@ -449,12 +446,12 @@ RecommendationConfigItem>> getCurrentConfigData(ContainerData containerData, Tim
continue;
}
if (null == configItem.getFormat()) {
- if (recommendationItem.equals(AnalyzerConstants.RecommendationItem.cpu)) {
+ if (recommendationItem.equals(AnalyzerConstants.RecommendationItem.CPU)) {
notifications.add(RecommendationConstants.RecommendationNotification.ERROR_FORMAT_MISSING_IN_CPU_SECTION);
LOGGER.error(RecommendationConstants.RecommendationNotificationMsgConstant.FORMAT_MISSING_IN_CPU_SECTION
.concat(String.format(AnalyzerErrorConstants.AutotuneObjectErrors.EXPERIMENT_AND_INTERVAL_END_TIME,
experimentName, interval_end_time)));
- } else if (recommendationItem.equals((AnalyzerConstants.RecommendationItem.memory))) {
+ } else if (recommendationItem.equals((AnalyzerConstants.RecommendationItem.MEMORY))) {
notifications.add(RecommendationConstants.RecommendationNotification.ERROR_FORMAT_MISSING_IN_MEMORY_SECTION);
LOGGER.error(RecommendationConstants.RecommendationNotificationMsgConstant.FORMAT_MISSING_IN_MEMORY_SECTION
.concat(String.format(AnalyzerErrorConstants.AutotuneObjectErrors.EXPERIMENT_AND_INTERVAL_END_TIME,
@@ -463,12 +460,12 @@ RecommendationConfigItem>> getCurrentConfigData(ContainerData containerData, Tim
continue;
}
if (configItem.getAmount() <= 0.0) {
- if (recommendationItem.equals(AnalyzerConstants.RecommendationItem.cpu)) {
+ if (recommendationItem.equals(AnalyzerConstants.RecommendationItem.CPU)) {
notifications.add(RecommendationConstants.RecommendationNotification.ERROR_INVALID_AMOUNT_IN_CPU_SECTION);
LOGGER.error(RecommendationConstants.RecommendationNotificationMsgConstant.INVALID_AMOUNT_IN_CPU_SECTION
.concat(String.format(AnalyzerErrorConstants.AutotuneObjectErrors.EXPERIMENT_AND_INTERVAL_END_TIME,
experimentName, interval_end_time)));
- } else if (recommendationItem.equals((AnalyzerConstants.RecommendationItem.memory))) {
+ } else if (recommendationItem.equals((AnalyzerConstants.RecommendationItem.MEMORY))) {
notifications.add(RecommendationConstants.RecommendationNotification.ERROR_INVALID_AMOUNT_IN_MEMORY_SECTION);
LOGGER.error(RecommendationConstants.RecommendationNotificationMsgConstant.INVALID_AMOUNT_IN_MEMORY_SECTION
.concat(String.format(AnalyzerErrorConstants.AutotuneObjectErrors.EXPERIMENT_AND_INTERVAL_END_TIME,
@@ -477,12 +474,12 @@ RecommendationConfigItem>> getCurrentConfigData(ContainerData containerData, Tim
continue;
}
if (configItem.getFormat().isEmpty() || configItem.getFormat().isBlank()) {
- if (recommendationItem.equals(AnalyzerConstants.RecommendationItem.cpu)) {
+ if (recommendationItem.equals(AnalyzerConstants.RecommendationItem.CPU)) {
notifications.add(RecommendationConstants.RecommendationNotification.ERROR_INVALID_FORMAT_IN_CPU_SECTION);
LOGGER.error(RecommendationConstants.RecommendationNotificationMsgConstant.INVALID_FORMAT_IN_CPU_SECTION
.concat(String.format(AnalyzerErrorConstants.AutotuneObjectErrors.EXPERIMENT_AND_INTERVAL_END_TIME,
experimentName, interval_end_time)));
- } else if (recommendationItem.equals((AnalyzerConstants.RecommendationItem.memory))) {
+ } else if (recommendationItem.equals((AnalyzerConstants.RecommendationItem.MEMORY))) {
notifications.add(RecommendationConstants.RecommendationNotification.ERROR_INVALID_FORMAT_IN_MEMORY_SECTION);
LOGGER.error(RecommendationConstants.RecommendationNotificationMsgConstant.INVALID_FORMAT_IN_MEMORY_SECTION
.concat(String.format(AnalyzerErrorConstants.AutotuneObjectErrors.EXPERIMENT_AND_INTERVAL_END_TIME,
@@ -668,20 +665,20 @@ private MappedRecommendationForModel generateRecommendationBasedOnModel(Timestam
if (currentConfigMap.containsKey(AnalyzerConstants.ResourceSetting.requests) && null != currentConfigMap.get(AnalyzerConstants.ResourceSetting.requests)) {
HashMap requestsMap = currentConfigMap.get(AnalyzerConstants.ResourceSetting.requests);
- if (requestsMap.containsKey(AnalyzerConstants.RecommendationItem.cpu) && null != requestsMap.get(AnalyzerConstants.RecommendationItem.cpu)) {
- currentCPURequest = requestsMap.get(AnalyzerConstants.RecommendationItem.cpu);
+ if (requestsMap.containsKey(AnalyzerConstants.RecommendationItem.CPU) && null != requestsMap.get(AnalyzerConstants.RecommendationItem.CPU)) {
+ currentCPURequest = requestsMap.get(AnalyzerConstants.RecommendationItem.CPU);
}
- if (requestsMap.containsKey(AnalyzerConstants.RecommendationItem.memory) && null != requestsMap.get(AnalyzerConstants.RecommendationItem.memory)) {
- currentMemRequest = requestsMap.get(AnalyzerConstants.RecommendationItem.memory);
+ if (requestsMap.containsKey(AnalyzerConstants.RecommendationItem.MEMORY) && null != requestsMap.get(AnalyzerConstants.RecommendationItem.MEMORY)) {
+ currentMemRequest = requestsMap.get(AnalyzerConstants.RecommendationItem.MEMORY);
}
}
if (currentConfigMap.containsKey(AnalyzerConstants.ResourceSetting.limits) && null != currentConfigMap.get(AnalyzerConstants.ResourceSetting.limits)) {
HashMap limitsMap = currentConfigMap.get(AnalyzerConstants.ResourceSetting.limits);
- if (limitsMap.containsKey(AnalyzerConstants.RecommendationItem.cpu) && null != limitsMap.get(AnalyzerConstants.RecommendationItem.cpu)) {
- currentCPULimit = limitsMap.get(AnalyzerConstants.RecommendationItem.cpu);
+ if (limitsMap.containsKey(AnalyzerConstants.RecommendationItem.CPU) && null != limitsMap.get(AnalyzerConstants.RecommendationItem.CPU)) {
+ currentCPULimit = limitsMap.get(AnalyzerConstants.RecommendationItem.CPU);
}
- if (limitsMap.containsKey(AnalyzerConstants.RecommendationItem.memory) && null != limitsMap.get(AnalyzerConstants.RecommendationItem.memory)) {
- currentMemLimit = limitsMap.get(AnalyzerConstants.RecommendationItem.memory);
+ if (limitsMap.containsKey(AnalyzerConstants.RecommendationItem.MEMORY) && null != limitsMap.get(AnalyzerConstants.RecommendationItem.MEMORY)) {
+ currentMemLimit = limitsMap.get(AnalyzerConstants.RecommendationItem.MEMORY);
}
}
if (null != monitoringStartTime) {
@@ -826,40 +823,40 @@ private HashMap requestsMap = currentNamespaceConfigMap.get(AnalyzerConstants.ResourceSetting.requests);
- if (requestsMap.containsKey(AnalyzerConstants.RecommendationItem.cpu) && null != requestsMap.get(AnalyzerConstants.RecommendationItem.cpu)) {
- currentNamespaceCPURequest = requestsMap.get(AnalyzerConstants.RecommendationItem.cpu);
+ if (requestsMap.containsKey(AnalyzerConstants.RecommendationItem.CPU) && null != requestsMap.get(AnalyzerConstants.RecommendationItem.CPU)) {
+ currentNamespaceCPURequest = requestsMap.get(AnalyzerConstants.RecommendationItem.CPU);
}
- if (requestsMap.containsKey(AnalyzerConstants.RecommendationItem.memory) && null != requestsMap.get(AnalyzerConstants.RecommendationItem.memory)) {
- currentNamespaceMemRequest = requestsMap.get(AnalyzerConstants.RecommendationItem.memory);
+ if (requestsMap.containsKey(AnalyzerConstants.RecommendationItem.MEMORY) && null != requestsMap.get(AnalyzerConstants.RecommendationItem.MEMORY)) {
+ currentNamespaceMemRequest = requestsMap.get(AnalyzerConstants.RecommendationItem.MEMORY);
}
}
if (currentNamespaceConfigMap.containsKey(AnalyzerConstants.ResourceSetting.limits) && null != currentNamespaceConfigMap.get(AnalyzerConstants.ResourceSetting.limits)) {
HashMap limitsMap = currentNamespaceConfigMap.get(AnalyzerConstants.ResourceSetting.limits);
- if (limitsMap.containsKey(AnalyzerConstants.RecommendationItem.cpu) && null != limitsMap.get(AnalyzerConstants.RecommendationItem.cpu)) {
- currentNamespaceCPULimit = limitsMap.get(AnalyzerConstants.RecommendationItem.cpu);
+ if (limitsMap.containsKey(AnalyzerConstants.RecommendationItem.CPU) && null != limitsMap.get(AnalyzerConstants.RecommendationItem.CPU)) {
+ currentNamespaceCPULimit = limitsMap.get(AnalyzerConstants.RecommendationItem.CPU);
}
- if (limitsMap.containsKey(AnalyzerConstants.RecommendationItem.memory) && null != limitsMap.get(AnalyzerConstants.RecommendationItem.memory)) {
- currentNamespaceMemLimit = limitsMap.get(AnalyzerConstants.RecommendationItem.memory);
+ if (limitsMap.containsKey(AnalyzerConstants.RecommendationItem.MEMORY) && null != limitsMap.get(AnalyzerConstants.RecommendationItem.MEMORY)) {
+ currentNamespaceMemLimit = limitsMap.get(AnalyzerConstants.RecommendationItem.MEMORY);
}
}
if (null != monitoringStartTime) {
@@ -1273,7 +1270,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
generatedCpuRequestFormat = recommendationCpuRequest.getFormat();
if (null != generatedCpuRequestFormat && !generatedCpuRequestFormat.isEmpty()) {
isRecommendedCPURequestAvailable = true;
- requestsMap.put(AnalyzerConstants.RecommendationItem.cpu, recommendationCpuRequest);
+ requestsMap.put(AnalyzerConstants.RecommendationItem.CPU, recommendationCpuRequest);
} else {
RecommendationNotification recommendationNotification = new RecommendationNotification(RecommendationConstants.RecommendationNotification.ERROR_FORMAT_MISSING_IN_CPU_SECTION);
notifications.add(recommendationNotification);
@@ -1289,7 +1286,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
generatedMemRequestFormat = recommendationMemRequest.getFormat();
if (null != generatedMemRequestFormat && !generatedMemRequestFormat.isEmpty()) {
isRecommendedMemoryRequestAvailable = true;
- requestsMap.put(AnalyzerConstants.RecommendationItem.memory, recommendationMemRequest);
+ requestsMap.put(AnalyzerConstants.RecommendationItem.MEMORY, recommendationMemRequest);
} else {
RecommendationNotification recommendationNotification = new RecommendationNotification(RecommendationConstants.RecommendationNotification.ERROR_FORMAT_MISSING_IN_MEMORY_SECTION);
notifications.add(recommendationNotification);
@@ -1325,7 +1322,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
generatedCpuLimitFormat = recommendationCpuLimits.getFormat();
if (null != generatedCpuLimitFormat && !generatedCpuLimitFormat.isEmpty()) {
isRecommendedCPULimitAvailable = true;
- limitsMap.put(AnalyzerConstants.RecommendationItem.cpu, recommendationCpuLimits);
+ limitsMap.put(AnalyzerConstants.RecommendationItem.CPU, recommendationCpuLimits);
} else {
RecommendationNotification recommendationNotification = new RecommendationNotification(RecommendationConstants.RecommendationNotification.ERROR_FORMAT_MISSING_IN_CPU_SECTION);
notifications.add(recommendationNotification);
@@ -1341,7 +1338,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
generatedMemLimitFormat = recommendationMemLimits.getFormat();
if (null != generatedMemLimitFormat && !generatedMemLimitFormat.isEmpty()) {
isRecommendedMemoryLimitAvailable = true;
- limitsMap.put(AnalyzerConstants.RecommendationItem.memory, recommendationMemLimits);
+ limitsMap.put(AnalyzerConstants.RecommendationItem.MEMORY, recommendationMemLimits);
} else {
RecommendationNotification recommendationNotification = new RecommendationNotification(RecommendationConstants.RecommendationNotification.ERROR_FORMAT_MISSING_IN_MEMORY_SECTION);
notifications.add(recommendationNotification);
@@ -1373,7 +1370,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
experimentName, interval_end_time)));
} else {
isCurrentCPURequestAvailable = true;
- currentRequestsMap.put(AnalyzerConstants.RecommendationItem.cpu, currentCpuRequest);
+ currentRequestsMap.put(AnalyzerConstants.RecommendationItem.CPU, currentCpuRequest);
}
}
@@ -1393,7 +1390,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
experimentName, interval_end_time)));
} else {
isCurrentMemoryRequestAvailable = true;
- currentRequestsMap.put(AnalyzerConstants.RecommendationItem.memory, currentMemRequest);
+ currentRequestsMap.put(AnalyzerConstants.RecommendationItem.MEMORY, currentMemRequest);
}
}
@@ -1416,7 +1413,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
experimentName, interval_end_time)));
} else {
isCurrentCPULimitAvailable = true;
- currentLimitsMap.put(AnalyzerConstants.RecommendationItem.cpu, currentCpuLimit);
+ currentLimitsMap.put(AnalyzerConstants.RecommendationItem.CPU, currentCpuLimit);
}
}
@@ -1436,7 +1433,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
experimentName, interval_end_time)));
} else {
isCurrentMemoryLimitAvailable = true;
- currentLimitsMap.put(AnalyzerConstants.RecommendationItem.memory, currentMemLimit);
+ currentLimitsMap.put(AnalyzerConstants.RecommendationItem.MEMORY, currentMemLimit);
}
}
@@ -1454,7 +1451,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
// TODO: If difference is positive it can be considered as under-provisioning, Need to handle it better
isVariationCPURequestAvailable = true;
variationCpuRequest = new RecommendationConfigItem(diff, generatedCpuRequestFormat);
- requestsVariationMap.put(AnalyzerConstants.RecommendationItem.cpu, variationCpuRequest);
+ requestsVariationMap.put(AnalyzerConstants.RecommendationItem.CPU, variationCpuRequest);
}
double currentMemRequestValue = 0.0;
@@ -1466,7 +1463,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
// TODO: If difference is positive it can be considered as under-provisioning, Need to handle it better
isVariationMemoryRequestAvailable = true;
variationMemRequest = new RecommendationConfigItem(diff, generatedMemRequestFormat);
- requestsVariationMap.put(AnalyzerConstants.RecommendationItem.memory, variationMemRequest);
+ requestsVariationMap.put(AnalyzerConstants.RecommendationItem.MEMORY, variationMemRequest);
}
// Create a new map for storing variation in limits
@@ -1483,7 +1480,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
double diff = generatedCpuLimit - currentCpuLimitValue;
isVariationCPULimitAvailable = true;
variationCpuLimit = new RecommendationConfigItem(diff, generatedCpuLimitFormat);
- limitsVariationMap.put(AnalyzerConstants.RecommendationItem.cpu, variationCpuLimit);
+ limitsVariationMap.put(AnalyzerConstants.RecommendationItem.CPU, variationCpuLimit);
}
double currentMemLimitValue = 0.0;
@@ -1494,7 +1491,7 @@ private boolean populateRecommendation(Map.Entry termEntry,
double diff = generatedMemLimit - currentMemLimitValue;
isVariationMemoryLimitAvailable = true;
variationMemLimit = new RecommendationConfigItem(diff, generatedMemLimitFormat);
- limitsVariationMap.put(AnalyzerConstants.RecommendationItem.memory, variationMemLimit);
+ limitsVariationMap.put(AnalyzerConstants.RecommendationItem.MEMORY, variationMemLimit);
}
// build the engine level notifications here
@@ -1535,23 +1532,23 @@ private boolean populateRecommendation(Map.Entry termEntry,
// Alternative - CPU REQUEST VALUE
// Accessing existing recommendation item
- RecommendationConfigItem tempAccessedRecCPURequest = requestsMap.get(AnalyzerConstants.RecommendationItem.cpu);
+ RecommendationConfigItem tempAccessedRecCPURequest = requestsMap.get(AnalyzerConstants.RecommendationItem.CPU);
if (null != tempAccessedRecCPURequest) {
// Updating it with desired value
tempAccessedRecCPURequest.setAmount(currentCpuRequestValue);
}
// Replace the updated object (Step not needed as we are updating existing object, but just to make sure it's updated)
- requestsMap.put(AnalyzerConstants.RecommendationItem.cpu, tempAccessedRecCPURequest);
+ requestsMap.put(AnalyzerConstants.RecommendationItem.CPU, tempAccessedRecCPURequest);
// Alternative - CPU REQUEST VARIATION VALUE
// Accessing existing recommendation item
- RecommendationConfigItem tempAccessedRecCPURequestVariation = requestsVariationMap.get(AnalyzerConstants.RecommendationItem.cpu);
+ RecommendationConfigItem tempAccessedRecCPURequestVariation = requestsVariationMap.get(AnalyzerConstants.RecommendationItem.CPU);
if (null != tempAccessedRecCPURequestVariation) {
// Updating it with desired value (as we are setting to current variation would be 0)
tempAccessedRecCPURequestVariation.setAmount(CPU_ZERO);
}
// Replace the updated object (Step not needed as we are updating existing object, but just to make sure it's updated)
- requestsVariationMap.put(AnalyzerConstants.RecommendationItem.cpu, tempAccessedRecCPURequestVariation);
+ requestsVariationMap.put(AnalyzerConstants.RecommendationItem.CPU, tempAccessedRecCPURequestVariation);
RecommendationNotification recommendationNotification = new RecommendationNotification(RecommendationConstants.RecommendationNotification.NOTICE_CPU_REQUESTS_OPTIMISED);
engineNotifications.add(recommendationNotification);
@@ -1575,23 +1572,23 @@ private boolean populateRecommendation(Map.Entry termEntry,
// Alternative - CPU LIMIT VALUE
// Accessing existing recommendation item
- RecommendationConfigItem tempAccessedRecCPULimit = limitsMap.get(AnalyzerConstants.RecommendationItem.cpu);
+ RecommendationConfigItem tempAccessedRecCPULimit = limitsMap.get(AnalyzerConstants.RecommendationItem.CPU);
if (null != tempAccessedRecCPULimit) {
// Updating it with desired value
tempAccessedRecCPULimit.setAmount(currentCpuLimitValue);
}
// Replace the updated object (Step not needed as we are updating existing object, but just to make sure it's updated)
- limitsMap.put(AnalyzerConstants.RecommendationItem.cpu, tempAccessedRecCPULimit);
+ limitsMap.put(AnalyzerConstants.RecommendationItem.CPU, tempAccessedRecCPULimit);
// Alternative - CPU LIMIT VARIATION VALUE
// Accessing existing recommendation item
- RecommendationConfigItem tempAccessedRecCPULimitVariation = limitsVariationMap.get(AnalyzerConstants.RecommendationItem.cpu);
+ RecommendationConfigItem tempAccessedRecCPULimitVariation = limitsVariationMap.get(AnalyzerConstants.RecommendationItem.CPU);
if (null != tempAccessedRecCPULimitVariation) {
// Updating it with desired value (as we are setting to current variation would be 0)
tempAccessedRecCPULimitVariation.setAmount(CPU_ZERO);
}
// Replace the updated object (Step not needed as we are updating existing object, but just to make sure it's updated)
- limitsVariationMap.put(AnalyzerConstants.RecommendationItem.cpu, tempAccessedRecCPULimitVariation);
+ limitsVariationMap.put(AnalyzerConstants.RecommendationItem.CPU, tempAccessedRecCPULimitVariation);
RecommendationNotification recommendationNotification = new RecommendationNotification(RecommendationConstants.RecommendationNotification.NOTICE_CPU_LIMITS_OPTIMISED);
engineNotifications.add(recommendationNotification);
@@ -1615,23 +1612,23 @@ private boolean populateRecommendation(Map.Entry termEntry,
// Alternative - MEMORY REQUEST VALUE
// Accessing existing recommendation item
- RecommendationConfigItem tempAccessedRecMemoryRequest = requestsMap.get(AnalyzerConstants.RecommendationItem.memory);
+ RecommendationConfigItem tempAccessedRecMemoryRequest = requestsMap.get(AnalyzerConstants.RecommendationItem.MEMORY);
if (null != tempAccessedRecMemoryRequest) {
// Updating it with desired value
tempAccessedRecMemoryRequest.setAmount(currentMemRequestValue);
}
// Replace the updated object (Step not needed as we are updating existing object, but just to make sure it's updated)
- requestsMap.put(AnalyzerConstants.RecommendationItem.memory, tempAccessedRecMemoryRequest);
+ requestsMap.put(AnalyzerConstants.RecommendationItem.MEMORY, tempAccessedRecMemoryRequest);
// Alternative - MEMORY REQUEST VARIATION VALUE
// Accessing existing recommendation item
- RecommendationConfigItem tempAccessedRecMemoryRequestVariation = requestsVariationMap.get(AnalyzerConstants.RecommendationItem.memory);
+ RecommendationConfigItem tempAccessedRecMemoryRequestVariation = requestsVariationMap.get(AnalyzerConstants.RecommendationItem.MEMORY);
if (null != tempAccessedRecMemoryRequestVariation) {
// Updating it with desired value (as we are setting to current variation would be 0)
tempAccessedRecMemoryRequestVariation.setAmount(MEM_ZERO);
}
// Replace the updated object (Step not needed as we are updating existing object, but just to make sure it's updated)
- requestsVariationMap.put(AnalyzerConstants.RecommendationItem.memory, tempAccessedRecMemoryRequestVariation);
+ requestsVariationMap.put(AnalyzerConstants.RecommendationItem.MEMORY, tempAccessedRecMemoryRequestVariation);
RecommendationNotification recommendationNotification = new RecommendationNotification(RecommendationConstants.RecommendationNotification.NOTICE_MEMORY_REQUESTS_OPTIMISED);
engineNotifications.add(recommendationNotification);
@@ -1655,23 +1652,23 @@ private boolean populateRecommendation(Map.Entry termEntry,
// Alternative - MEMORY LIMIT VALUE
// Accessing existing recommendation item
- RecommendationConfigItem tempAccessedRecMemoryLimit = limitsMap.get(AnalyzerConstants.RecommendationItem.memory);
+ RecommendationConfigItem tempAccessedRecMemoryLimit = limitsMap.get(AnalyzerConstants.RecommendationItem.MEMORY);
if (null != tempAccessedRecMemoryLimit) {
// Updating it with desired value
tempAccessedRecMemoryLimit.setAmount(currentMemLimitValue);
}
// Replace the updated object (Step not needed as we are updating existing object, but just to make sure it's updated)
- limitsMap.put(AnalyzerConstants.RecommendationItem.memory, tempAccessedRecMemoryLimit);
+ limitsMap.put(AnalyzerConstants.RecommendationItem.MEMORY, tempAccessedRecMemoryLimit);
// Alternative - MEMORY LIMIT VARIATION VALUE
// Accessing existing recommendation item
- RecommendationConfigItem tempAccessedRecMemoryLimitVariation = limitsVariationMap.get(AnalyzerConstants.RecommendationItem.memory);
+ RecommendationConfigItem tempAccessedRecMemoryLimitVariation = limitsVariationMap.get(AnalyzerConstants.RecommendationItem.MEMORY);
if (null != tempAccessedRecMemoryLimitVariation) {
// Updating it with desired value (as we are setting to current variation would be 0)
tempAccessedRecMemoryLimitVariation.setAmount(MEM_ZERO);
}
// Replace the updated object (Step not needed as we are updating existing object, but just to make sure it's updated)
- limitsVariationMap.put(AnalyzerConstants.RecommendationItem.memory, tempAccessedRecMemoryLimitVariation);
+ limitsVariationMap.put(AnalyzerConstants.RecommendationItem.MEMORY, tempAccessedRecMemoryLimitVariation);
RecommendationNotification recommendationNotification = new RecommendationNotification(RecommendationConstants.RecommendationNotification.NOTICE_MEMORY_LIMITS_OPTIMISED);
engineNotifications.add(recommendationNotification);
diff --git a/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java b/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
index 2deac4110..158935eed 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
@@ -2,15 +2,11 @@
import com.autotune.analyzer.recommendations.RecommendationConfigItem;
import com.autotune.analyzer.recommendations.RecommendationConstants;
-import com.autotune.analyzer.recommendations.RecommendationNotification;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.common.data.metrics.MetricResults;
-import com.autotune.common.data.result.ContainerData;
import com.autotune.common.data.result.IntervalResults;
-import com.autotune.utils.KruizeConstants;
import java.sql.Timestamp;
-import java.time.LocalDateTime;
import java.util.*;
public class RecommendationUtils {
@@ -28,15 +24,15 @@ public static RecommendationConfigItem getCurrentValue(Map
Date: Mon, 7 Oct 2024 12:45:51 +0530
Subject: [PATCH 12/64] renamed jobID to job_id
Signed-off-by: msvinaykumar
---
design/BulkAPI.md | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/design/BulkAPI.md b/design/BulkAPI.md
index 095149ce9..4faec7a1f 100644
--- a/design/BulkAPI.md
+++ b/design/BulkAPI.md
@@ -5,18 +5,18 @@ containers, namespaces, etc., for a cluster connected via the datasource integra
be configured using filters like exclude/include namespaces, workloads, containers, or labels for generating
recommendations. It also has settings to generate recommendations at both the container or namespace level, or both.
-Bulk returns a `jobID` as a response to track the job status. The user can use the `jobID` to monitor the
+Bulk returns a `job_id` as a response to track the job status. The user can use the `job_id` to monitor the
progress of the job.
## Task Flow When Bulk Is Invoked
-1. Returns a unique `jobID`.
+1. Returns a unique `job_id`.
2. Background Bulk:
- First, does a handshake with the datasource.
- Using queries, it fetches the list of namespaces, workloads, containers of the connected datasource.
    - Creates experiments, one for each container (alpha release).
- Triggers `generateRecommendations` for each container.
- - Once all experiments are created, and recommendations are generated, the system marks the `jobID` as "COMPLETED".
+ - Once all experiments are created, and recommendations are generated, the system marks the `job_id` as "COMPLETED".
## API Specification
@@ -79,21 +79,21 @@ progress of the job.
```json
{
- "jobid": "123e4567-e89b-12d3-a456-426614174000"
+ "job_id": "123e4567-e89b-12d3-a456-426614174000"
}
```
### GET Request:
```bash
-GET /bulk?jobid=123e4567-e89b-12d3-a456-426614174000
+GET /bulk?job_id=123e4567-e89b-12d3-a456-426614174000
```
**Body (JSON):**
```json
{
- "jobID": "123e4567-e89b-12d3-a456-426614174000",
+ "job_id": "123e4567-e89b-12d3-a456-426614174000",
"status": "IN-PROGRESS",
"progress": 30,
"data": {
@@ -136,7 +136,7 @@ GET /bulk?jobid=123e4567-e89b-12d3-a456-426614174000
### Response Parameters
-- **jobID:** Unique identifier for the job.
+- **job_id:** Unique identifier for the job.
- **status:** Current status of the job. Possible values: `"IN-PROGRESS"`, `"COMPLETED"`, `"FAILED"`.
- **progress:** Percentage of job completion.
- **data:** Contains detailed information about the experiments and recommendations.
From 39f797e3721d93435f7011bf78965ea3d5b49b4c Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Fri, 4 Oct 2024 08:08:58 +0530
Subject: [PATCH 13/64] Add changes to fetch Accelerator metrics at the time of
recommendation generation
Signed-off-by: bharathappali
---
.../engine/RecommendationEngine.java | 234 +++++++++++++++---
.../analyzer/utils/AnalyzerConstants.java | 8 +
.../autotune/common/utils/CommonUtils.java | 186 +++++++++++++-
.../com/autotune/utils/KruizeConstants.java | 5 +
4 files changed, 392 insertions(+), 41 deletions(-)
diff --git a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
index dd9683fbb..81ad283b9 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
@@ -18,13 +18,12 @@
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.analyzer.utils.AnalyzerErrorConstants;
import com.autotune.common.data.ValidationOutputData;
-import com.autotune.common.data.metrics.AggregationFunctions;
-import com.autotune.common.data.metrics.Metric;
-import com.autotune.common.data.metrics.MetricAggregationInfoResults;
-import com.autotune.common.data.metrics.MetricResults;
+import com.autotune.common.data.metrics.*;
import com.autotune.common.data.result.ContainerData;
import com.autotune.common.data.result.IntervalResults;
import com.autotune.common.data.result.NamespaceData;
+import com.autotune.common.data.system.info.device.DeviceDetails;
+import com.autotune.common.data.system.info.device.accelerator.AcceleratorDeviceData;
import com.autotune.common.datasource.DataSourceInfo;
import com.autotune.common.exceptions.DataSourceNotExist;
import com.autotune.common.k8sObjects.K8sObject;
@@ -1805,9 +1804,17 @@ public void fetchMetricsBasedOnProfileAndDatasource(KruizeObject kruizeObject, T
}
String maxDateQuery = null;
+ String acceleratorDetectionQuery = null;
if (kruizeObject.isContainerExperiment()) {
maxDateQuery = getMaxDateQuery(metricProfile, AnalyzerConstants.MetricName.maxDate.name());
- fetchContainerMetricsBasedOnDataSourceAndProfile(kruizeObject, interval_end_time, interval_start_time, dataSourceInfo, metricProfile, maxDateQuery);
+ acceleratorDetectionQuery = getMaxDateQuery(metricProfile, AnalyzerConstants.MetricName.gpuMemoryUsage.name());
+ fetchContainerMetricsBasedOnDataSourceAndProfile(kruizeObject,
+ interval_end_time,
+ interval_start_time,
+ dataSourceInfo,
+ metricProfile,
+ maxDateQuery,
+ acceleratorDetectionQuery);
} else if (kruizeObject.isNamespaceExperiment()) {
maxDateQuery = getMaxDateQuery(metricProfile, AnalyzerConstants.MetricName.namespaceMaxDate.name());
fetchNamespaceMetricsBasedOnDataSourceAndProfile(kruizeObject, interval_end_time, interval_start_time, dataSourceInfo, metricProfile, maxDateQuery);
@@ -1975,7 +1982,7 @@ private void fetchNamespaceMetricsBasedOnDataSourceAndProfile(KruizeObject kruiz
/**
- * Fetches namespace metrics based on the specified datasource using queries from the metricProfile for the given time interval.
+ * Fetches container metrics based on the specified datasource using queries from the metricProfile for the given time interval.
*
* @param kruizeObject KruizeObject
* @param interval_end_time The end time of the interval in the format yyyy-MM-ddTHH:mm:sssZ
@@ -1985,7 +1992,13 @@ private void fetchNamespaceMetricsBasedOnDataSourceAndProfile(KruizeObject kruiz
* @param maxDateQuery max date query for containers
* @throws Exception
*/
- private void fetchContainerMetricsBasedOnDataSourceAndProfile(KruizeObject kruizeObject, Timestamp interval_end_time, Timestamp interval_start_time, DataSourceInfo dataSourceInfo, PerformanceProfile metricProfile, String maxDateQuery) throws Exception, FetchMetricsError {
+ private void fetchContainerMetricsBasedOnDataSourceAndProfile(KruizeObject kruizeObject,
+ Timestamp interval_end_time,
+ Timestamp interval_start_time,
+ DataSourceInfo dataSourceInfo,
+ PerformanceProfile metricProfile,
+ String maxDateQuery,
+ String acceleratorDetectionQuery) throws Exception, FetchMetricsError {
try {
long interval_end_time_epoc = 0;
long interval_start_time_epoc = 0;
@@ -2004,6 +2017,20 @@ private void fetchContainerMetricsBasedOnDataSourceAndProfile(KruizeObject kruiz
for (Map.Entry entry : containerDataMap.entrySet()) {
ContainerData containerData = entry.getValue();
+
+            // If accelerator support has not yet been detected for this container, check for accelerator metrics
+ if (null == containerData.getContainerDeviceList() || !containerData.getContainerDeviceList().isAcceleratorDeviceDetected()) {
+ CommonUtils.markAcceleratorDeviceStatusToContainer(containerData,
+ maxDateQuery,
+ namespace,
+ workload,
+ workload_type,
+ dataSourceInfo,
+ kruizeObject.getTerms(),
+ measurementDurationMinutesInDouble,
+ acceleratorDetectionQuery);
+ }
+
String containerName = containerData.getContainer_name();
if (null == interval_end_time) {
LOGGER.info(KruizeConstants.APIMessages.CONTAINER_USAGE_INFO);
@@ -2055,20 +2082,46 @@ private void fetchContainerMetricsBasedOnDataSourceAndProfile(KruizeObject kruiz
HashMap containerDataResults = new HashMap<>();
IntervalResults intervalResults = null;
HashMap resMap = null;
- HashMap resultMap = null;
+ HashMap acceleratorMetricResultHashMap;
MetricResults metricResults = null;
MetricAggregationInfoResults metricAggregationInfoResults = null;
List metricList = metricProfile.getSloInfo().getFunctionVariables();
+ List acceleratorFunctions = Arrays.asList(
+ AnalyzerConstants.MetricName.gpuCoreUsage.toString(),
+ AnalyzerConstants.MetricName.gpuMemoryUsage.toString()
+ );
// Iterate over metrics and aggregation functions
for (Metric metricEntry : metricList) {
+
+ boolean isAcceleratorMetric = false;
+ boolean fetchAcceleratorMetrics = false;
+
+ if (acceleratorFunctions.contains(metricEntry.getName())) {
+ isAcceleratorMetric = true;
+ }
+
+ if (isAcceleratorMetric
+ && null != containerData.getContainerDeviceList()
+ && containerData.getContainerDeviceList().isAcceleratorDeviceDetected()) {
+ fetchAcceleratorMetrics = true;
+ }
+
+ // Skip fetching Accelerator metrics if the workload doesn't use Accelerator
+ if (isAcceleratorMetric && !fetchAcceleratorMetrics)
+ continue;
+
HashMap aggregationFunctions = metricEntry.getAggregationFunctionsMap();
for (Map.Entry aggregationFunctionsEntry: aggregationFunctions.entrySet()) {
// Determine promQL query on metric type
String promQL = aggregationFunctionsEntry.getValue().getQuery();
- String format = null;
+ // Skipping if the promQL is empty
+ if (null == promQL || promQL.isEmpty())
+ continue;
+
+ String format = null;
// Determine format based on metric type - Todo move this metric profile
List cpuFunction = Arrays.asList(AnalyzerConstants.MetricName.cpuUsage.toString(), AnalyzerConstants.MetricName.cpuThrottle.toString(), AnalyzerConstants.MetricName.cpuLimit.toString(), AnalyzerConstants.MetricName.cpuRequest.toString());
@@ -2077,8 +2130,11 @@ private void fetchContainerMetricsBasedOnDataSourceAndProfile(KruizeObject kruiz
format = KruizeConstants.JSONKeys.CORES;
} else if (memFunction.contains(metricEntry.getName())) {
format = KruizeConstants.JSONKeys.BYTES;
+ } else if (isAcceleratorMetric) {
+ format = KruizeConstants.JSONKeys.CORES;
}
+ // If promQL is determined, fetch metrics from the datasource
promQL = promQL
.replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
.replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
@@ -2086,48 +2142,150 @@ private void fetchContainerMetricsBasedOnDataSourceAndProfile(KruizeObject kruiz
.replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
.replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);
- // If promQL is determined, fetch metrics from the datasource
- if (promQL != null) {
- LOGGER.info(promQL);
- String podMetricsUrl;
- try {
- podMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATASOURCE_ENDPOINT_WITH_QUERY,
- dataSourceInfo.getUrl(),
- URLEncoder.encode(promQL, CHARACTER_ENCODING),
- interval_start_time_epoc,
- interval_end_time_epoc,
- measurementDurationMinutesInDouble.intValue() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
- LOGGER.info(podMetricsUrl);
- client.setBaseURL(podMetricsUrl);
- JSONObject genericJsonObject = client.fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
- JsonObject jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
- JsonArray resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);
- // Process fetched metrics
- if (null != resultArray && !resultArray.isEmpty()) {
- resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(
- KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT).get(0)
- .getAsJsonObject().getAsJsonArray(KruizeConstants.DataSourceConstants
- .DataSourceQueryJSONKeys.VALUES);
- sdf.setTimeZone(TimeZone.getTimeZone(KruizeConstants.TimeUnitsExt.TimeZones.UTC));
+ LOGGER.info(promQL);
+ String podMetricsUrl;
+ try {
+ podMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATASOURCE_ENDPOINT_WITH_QUERY,
+ dataSourceInfo.getUrl(),
+ URLEncoder.encode(promQL, CHARACTER_ENCODING),
+ interval_start_time_epoc,
+ interval_end_time_epoc,
+ measurementDurationMinutesInDouble.intValue() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
+ LOGGER.info(podMetricsUrl);
+ client.setBaseURL(podMetricsUrl);
+ JSONObject genericJsonObject = client.fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
+ JsonObject jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
+ JsonArray resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);
+
+ // Skipping if Result array is null or empty
+ if (null == resultArray || resultArray.isEmpty())
+ continue;
+ // Process fetched metrics
+ if (isAcceleratorMetric){
+ for (JsonElement result : resultArray) {
+ JsonObject resultObject = result.getAsJsonObject();
+ JsonObject metricObject = resultObject.getAsJsonObject(KruizeConstants.JSONKeys.METRIC);
+
+ // Set the data only for the container Accelerator device
+ if (null == metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString())
+ continue;
+ if (metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString().isEmpty())
+ continue;
+
+ ArrayList deviceDetails = containerData.getContainerDeviceList().getDevices(AnalyzerConstants.DeviceType.ACCELERATOR);
+ // Continuing to next element
+ // All other elements will also fail as there is no Accelerator attached
+ // Theoretically, it doesn't fail, but the future implementations may change
+ // So adding a check after a function call to verify its return value is advisable
+ // TODO: Needs a check to figure out why the device list is empty when isAcceleratorDeviceDetected is true
+ if (null == deviceDetails)
+ continue;
+ if (deviceDetails.isEmpty())
+ continue;
+
+ // Assuming only one MIG supported Accelerator is attached
+ // Needs to be changed when we support multiple Accelerators
+ // Same changes need to be applied at the time of adding the device in
+ // DeviceHandler
+ DeviceDetails deviceDetail = deviceDetails.get(0);
+ AcceleratorDeviceData containerAcceleratorDeviceData = (AcceleratorDeviceData) deviceDetail;
+
+ // Skip non-matching Accelerator entries
+ if (!metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString().equalsIgnoreCase(containerAcceleratorDeviceData.getModelName()))
+ continue;
+
+ AcceleratorDeviceData acceleratorDeviceData = new AcceleratorDeviceData(metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString(),
+ metricObject.get(KruizeConstants.JSONKeys.HOSTNAME).getAsString(),
+ metricObject.get(KruizeConstants.JSONKeys.UUID).getAsString(),
+ metricObject.get(KruizeConstants.JSONKeys.DEVICE).getAsString(),
+ true);
+
+ JsonArray valuesArray = resultObject.getAsJsonArray(KruizeConstants.DataSourceConstants
+ .DataSourceQueryJSONKeys.VALUES);
+ sdf.setTimeZone(TimeZone.getTimeZone(KruizeConstants.TimeUnitsExt.TimeZones.UTC));
// Iterate over fetched metrics
Timestamp sTime = new Timestamp(interval_start_time_epoc);
- for (JsonElement element : resultArray) {
+ for (JsonElement element : valuesArray) {
JsonArray valueArray = element.getAsJsonArray();
long epochTime = valueArray.get(0).getAsLong();
double value = valueArray.get(1).getAsDouble();
String timestamp = sdf.format(new Date(epochTime * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC));
Date date = sdf.parse(timestamp);
- Timestamp eTime = new Timestamp(date.getTime());
+ Timestamp tempTime = new Timestamp(date.getTime());
+ Timestamp eTime = CommonUtils.getNearestTimestamp(containerDataResults,
+ tempTime,
+ AnalyzerConstants.AcceleratorConstants.AcceleratorMetricConstants.TIMESTAMP_RANGE_CHECK_IN_MINUTES);
+
+ // containerDataResults are empty so will use the prometheus timestamp
+ if (null == eTime) {
+ // eTime = tempTime;
+ // Skipping entry, as inconsistency with CPU & memory records may provide null pointer while accessing metric results
+ // TODO: Need to separate the data records of CPU and memory based on exporter
+ // TODO: Perform recommendation generation by stitching the outcome
+ continue;
+ }
// Prepare interval results
- prepareIntervalResults(containerDataResults, intervalResults, resMap, metricResults,
- metricAggregationInfoResults, sTime, eTime, metricEntry, aggregationFunctionsEntry, value, format);
+ if (containerDataResults.containsKey(eTime)) {
+ intervalResults = containerDataResults.get(eTime);
+ acceleratorMetricResultHashMap = intervalResults.getAcceleratorMetricResultHashMap();
+ if (null == acceleratorMetricResultHashMap)
+ acceleratorMetricResultHashMap = new HashMap<>();
+ } else {
+ intervalResults = new IntervalResults();
+ acceleratorMetricResultHashMap = new HashMap<>();
+ }
+ AnalyzerConstants.MetricName metricName = AnalyzerConstants.MetricName.valueOf(metricEntry.getName());
+ if (acceleratorMetricResultHashMap.containsKey(metricName)) {
+ metricResults = acceleratorMetricResultHashMap.get(metricName).getMetricResults();
+ metricAggregationInfoResults = metricResults.getAggregationInfoResult();
+ } else {
+ metricResults = new MetricResults();
+ metricAggregationInfoResults = new MetricAggregationInfoResults();
+ }
+ Method method = MetricAggregationInfoResults.class.getDeclaredMethod(KruizeConstants.APIMessages.SET + aggregationFunctionsEntry.getKey().substring(0, 1).toUpperCase() + aggregationFunctionsEntry.getKey().substring(1), Double.class);
+ method.invoke(metricAggregationInfoResults, value);
+ metricAggregationInfoResults.setFormat(format);
+ metricResults.setAggregationInfoResult(metricAggregationInfoResults);
+ metricResults.setName(String.valueOf(metricName));
+ metricResults.setFormat(format);
+ AcceleratorMetricResult acceleratorMetricResult = new AcceleratorMetricResult(acceleratorDeviceData, metricResults);
+ acceleratorMetricResultHashMap.put(metricName, acceleratorMetricResult);
+ intervalResults.setAcceleratorMetricResultHashMap(acceleratorMetricResultHashMap);
+ intervalResults.setIntervalStartTime(sTime); //Todo this will change
+ intervalResults.setIntervalEndTime(eTime);
+ intervalResults.setDurationInMinutes((double) ((eTime.getTime() - sTime.getTime())
+ / ((long) KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE
+ * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC)));
+ containerDataResults.put(eTime, intervalResults);
+ sTime = eTime;
}
}
- } catch (Exception e) {
- throw new RuntimeException(e);
+ } else {
+ resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(
+ KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT).get(0)
+ .getAsJsonObject().getAsJsonArray(KruizeConstants.DataSourceConstants
+ .DataSourceQueryJSONKeys.VALUES);
+ sdf.setTimeZone(TimeZone.getTimeZone(KruizeConstants.TimeUnitsExt.TimeZones.UTC));
+
+ // Iterate over fetched metrics
+ Timestamp sTime = new Timestamp(interval_start_time_epoc);
+ for (JsonElement element : resultArray) {
+ JsonArray valueArray = element.getAsJsonArray();
+ long epochTime = valueArray.get(0).getAsLong();
+ double value = valueArray.get(1).getAsDouble();
+ String timestamp = sdf.format(new Date(epochTime * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC));
+ Date date = sdf.parse(timestamp);
+ Timestamp eTime = new Timestamp(date.getTime());
+
+ // Prepare interval results
+ prepareIntervalResults(containerDataResults, intervalResults, resMap, metricResults,
+ metricAggregationInfoResults, sTime, eTime, metricEntry, aggregationFunctionsEntry, value, format);
+ }
}
+ } catch (Exception e) {
+ throw new RuntimeException(e);
}
}
}
diff --git a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
index 473dc2290..8a7571b7c 100644
--- a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
+++ b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
@@ -240,6 +240,14 @@ private AcceleratorConstants() {
}
+ public static final class AcceleratorMetricConstants {
+ private AcceleratorMetricConstants() {
+
+ }
+
+ public static final int TIMESTAMP_RANGE_CHECK_IN_MINUTES = 5;
+ }
+
public static final class SupportedAccelerators {
private SupportedAccelerators() {
diff --git a/src/main/java/com/autotune/common/utils/CommonUtils.java b/src/main/java/com/autotune/common/utils/CommonUtils.java
index 384bc5dc3..64dfebfb1 100644
--- a/src/main/java/com/autotune/common/utils/CommonUtils.java
+++ b/src/main/java/com/autotune/common/utils/CommonUtils.java
@@ -16,24 +16,44 @@
package com.autotune.common.utils;
+import com.autotune.analyzer.exceptions.FetchMetricsError;
+import com.autotune.common.data.system.info.device.ContainerDeviceList;
+import com.autotune.common.data.system.info.device.accelerator.AcceleratorDeviceData;
import com.autotune.common.datasource.DataSourceCollection;
import com.autotune.common.datasource.DataSourceInfo;
import com.autotune.common.datasource.DataSourceManager;
+import com.autotune.analyzer.recommendations.term.Terms;
+import com.autotune.analyzer.utils.AnalyzerConstants;
+import com.autotune.common.data.result.ContainerData;
+import com.autotune.common.data.result.IntervalResults;
+import com.autotune.utils.GenericRestApiClient;
import com.autotune.utils.KruizeConstants;
+import com.google.gson.*;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.net.URLEncoder;
+import java.security.KeyManagementException;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
import java.sql.Timestamp;
-import java.util.Calendar;
-import java.util.Collections;
-import java.util.List;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import static com.autotune.analyzer.utils.AnalyzerConstants.ServiceConstants.CHARACTER_ENCODING;
/**
* This Class holds the utilities needed by the classes in common package
*/
public class CommonUtils {
+ private static final Logger LOGGER = LoggerFactory.getLogger(CommonUtils.class);
+
/**
* AutotuneDatasourceTypes is an ENUM which holds different types of
* datasources supported by Autotune
@@ -315,4 +335,164 @@ public static boolean isInvalidDataSource(DataSourceInfo datasource) {
return datasource == null || datasource.getAuthenticationConfig() == null ||
datasource.getAuthenticationConfig().toString().isEmpty();
}
+
+ public static void markAcceleratorDeviceStatusToContainer (ContainerData containerData,
+ String maxDateQuery,
+ String namespace,
+ String workload,
+ String workload_type,
+ DataSourceInfo dataSourceInfo,
+ Map termsMap,
+ Double measurementDurationMinutesInDouble,
+ String gpuDetectionQuery)
+ throws IOException, NoSuchAlgorithmException, KeyStoreException,
+ KeyManagementException, ParseException, FetchMetricsError {
+
+ SimpleDateFormat sdf = new SimpleDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT, Locale.ROOT);
+ String containerName = containerData.getContainer_name();
+ String queryToEncode = null;
+ long interval_end_time_epoc = 0;
+ long interval_start_time_epoc = 0;
+
+ LOGGER.info("maxDateQuery: {}", maxDateQuery);
+ queryToEncode = maxDateQuery
+ .replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
+ .replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
+ .replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
+ .replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);
+
+ String dateMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATE_ENDPOINT_WITH_QUERY,
+ dataSourceInfo.getUrl(),
+ URLEncoder.encode(queryToEncode, CHARACTER_ENCODING)
+ );
+
+ LOGGER.info(dateMetricsUrl);
+ GenericRestApiClient client = new GenericRestApiClient(dataSourceInfo);
+ client.setBaseURL(dateMetricsUrl);
+ JSONObject genericJsonObject = client.fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
+ JsonObject jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
+ JsonArray resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);
+
+ if (null == resultArray || resultArray.isEmpty()) {
+ // Need to alert that container max duration is not detected
+ // Ignoring it here, as we take care of it during recommendation generation
+ return;
+ }
+
+ resultArray = resultArray.get(0)
+ .getAsJsonObject().getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.VALUE);
+ long epochTime = resultArray.get(0).getAsLong();
+ String timestamp = sdf.format(new Date(epochTime * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC));
+ Date date = sdf.parse(timestamp);
+ Timestamp dateTS = new Timestamp(date.getTime());
+ interval_end_time_epoc = dateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
+ - ((long) dateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
+ int maxDay = Terms.getMaxDays(termsMap);
+ LOGGER.info(KruizeConstants.APIMessages.MAX_DAY, maxDay);
+ Timestamp startDateTS = Timestamp.valueOf(Objects.requireNonNull(dateTS).toLocalDateTime().minusDays(maxDay));
+ interval_start_time_epoc = startDateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
+ - ((long) startDateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC);
+
+ gpuDetectionQuery = gpuDetectionQuery.replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
+ .replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
+ .replace(AnalyzerConstants.MEASUREMENT_DURATION_IN_MIN_VARAIBLE, Integer.toString(measurementDurationMinutesInDouble.intValue()))
+ .replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
+ .replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);
+
+ String podMetricsUrl;
+ try {
+ podMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATASOURCE_ENDPOINT_WITH_QUERY,
+ dataSourceInfo.getUrl(),
+ URLEncoder.encode(gpuDetectionQuery, CHARACTER_ENCODING),
+ interval_start_time_epoc,
+ interval_end_time_epoc,
+ measurementDurationMinutesInDouble.intValue() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
+ LOGGER.info(podMetricsUrl);
+ client.setBaseURL(podMetricsUrl);
+ genericJsonObject = client.fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
+
+ jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
+ resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);
+
+ if (null != resultArray && !resultArray.isEmpty()) {
+ for (JsonElement result : resultArray) {
+ JsonObject resultObject = result.getAsJsonObject();
+ JsonArray valuesArray = resultObject.getAsJsonArray(KruizeConstants.DataSourceConstants
+ .DataSourceQueryJSONKeys.VALUES);
+
+ for (JsonElement element : valuesArray) {
+ JsonArray valueArray = element.getAsJsonArray();
+ double value = valueArray.get(1).getAsDouble();
+ // TODO: Check for non-zero values to mark as GPU workload
+ break;
+ }
+
+ JsonObject metricObject = resultObject.getAsJsonObject(KruizeConstants.JSONKeys.METRIC);
+ String modelName = metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString();
+ if (null == modelName)
+ continue;
+
+ boolean isSupportedMig = checkIfModelIsKruizeSupportedMIG(modelName);
+ if (isSupportedMig) {
+ AcceleratorDeviceData acceleratorDeviceData = new AcceleratorDeviceData(metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString(),
+ metricObject.get(KruizeConstants.JSONKeys.HOSTNAME).getAsString(),
+ metricObject.get(KruizeConstants.JSONKeys.UUID).getAsString(),
+ metricObject.get(KruizeConstants.JSONKeys.DEVICE).getAsString(),
+ isSupportedMig);
+
+
+ if (null == containerData.getContainerDeviceList()) {
+ ContainerDeviceList containerDeviceList = new ContainerDeviceList();
+ containerData.setContainerDeviceList(containerDeviceList);
+ }
+ containerData.getContainerDeviceList().addDevice(AnalyzerConstants.DeviceType.ACCELERATOR, acceleratorDeviceData);
+ // TODO: Currently we consider only the first mig supported GPU
+ return;
+ }
+ }
+ }
+ } catch (IOException | NoSuchAlgorithmException | KeyStoreException | KeyManagementException |
+ JsonSyntaxException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public static boolean checkIfModelIsKruizeSupportedMIG(String modelName) {
+ if (null == modelName || modelName.isEmpty())
+ return false;
+
+ modelName = modelName.toUpperCase();
+
+ boolean A100_CHECK = (modelName.contains("A100") &&
+ (modelName.contains("40GB") || modelName.contains("80GB")));
+
+ boolean H100_CHECK = false;
+
+ if (!A100_CHECK) {
+ H100_CHECK = (modelName.contains("H100") && modelName.contains("80GB"));
+ }
+
+ return A100_CHECK || H100_CHECK;
+ }
+
+ public static Timestamp getNearestTimestamp(HashMap containerDataResults, Timestamp targetTime, int minutesRange) {
+ long rangeInMillis = (long) minutesRange * 60 * 1000;
+ long targetTimeMillis = targetTime.getTime();
+
+ Timestamp nearestTimestamp = null;
+ long nearestDistance = Long.MAX_VALUE;
+
+ for (Map.Entry entry : containerDataResults.entrySet()) {
+ Timestamp currentTimestamp = entry.getKey();
+ long currentTimeMillis = currentTimestamp.getTime();
+ long distance = Math.abs(targetTimeMillis - currentTimeMillis);
+
+ if (distance <= rangeInMillis && distance < nearestDistance) {
+ nearestDistance = distance;
+ nearestTimestamp = currentTimestamp;
+ }
+ }
+
+ return nearestTimestamp;
+ }
}
diff --git a/src/main/java/com/autotune/utils/KruizeConstants.java b/src/main/java/com/autotune/utils/KruizeConstants.java
index 15779cdae..62dd08247 100644
--- a/src/main/java/com/autotune/utils/KruizeConstants.java
+++ b/src/main/java/com/autotune/utils/KruizeConstants.java
@@ -167,6 +167,7 @@ public static final class JSONKeys {
public static final String POD_METRICS = "pod_metrics";
public static final String CONTAINER_METRICS = "container_metrics";
public static final String METRICS = "metrics";
+ public static final String METRIC = "metric";
public static final String CONFIG = "config";
public static final String CURRENT = "current";
public static final String NAME = "name";
@@ -262,6 +263,10 @@ public static final class JSONKeys {
public static final String PLOTS_DATAPOINTS = "datapoints";
public static final String PLOTS_DATA = "plots_data";
public static final String CONFIDENCE_LEVEL = "confidence_level";
+ public static final String HOSTNAME = "Hostname";
+ public static final String UUID = "UUID";
+ public static final String DEVICE = "device";
+ public static final String MODEL_NAME = "modelName";
private JSONKeys() {
}
From d763d331a1232bb0754df23f879bc59f5b2ab350 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Tue, 8 Oct 2024 00:15:46 +0530
Subject: [PATCH 14/64] Change info to debug in logging
Signed-off-by: bharathappali
---
src/main/java/com/autotune/common/utils/CommonUtils.java | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/main/java/com/autotune/common/utils/CommonUtils.java b/src/main/java/com/autotune/common/utils/CommonUtils.java
index 64dfebfb1..3c454fc8a 100644
--- a/src/main/java/com/autotune/common/utils/CommonUtils.java
+++ b/src/main/java/com/autotune/common/utils/CommonUtils.java
@@ -354,7 +354,7 @@ public static void markAcceleratorDeviceStatusToContainer (ContainerData contain
long interval_end_time_epoc = 0;
long interval_start_time_epoc = 0;
- LOGGER.info("maxDateQuery: {}", maxDateQuery);
+ LOGGER.debug("maxDateQuery: {}", maxDateQuery);
queryToEncode = maxDateQuery
.replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
.replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
@@ -366,7 +366,7 @@ public static void markAcceleratorDeviceStatusToContainer (ContainerData contain
URLEncoder.encode(queryToEncode, CHARACTER_ENCODING)
);
- LOGGER.info(dateMetricsUrl);
+ LOGGER.debug(dateMetricsUrl);
GenericRestApiClient client = new GenericRestApiClient(dataSourceInfo);
client.setBaseURL(dateMetricsUrl);
JSONObject genericJsonObject = client.fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
@@ -388,7 +388,7 @@ public static void markAcceleratorDeviceStatusToContainer (ContainerData contain
interval_end_time_epoc = dateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
- ((long) dateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
int maxDay = Terms.getMaxDays(termsMap);
- LOGGER.info(KruizeConstants.APIMessages.MAX_DAY, maxDay);
+ LOGGER.debug(KruizeConstants.APIMessages.MAX_DAY, maxDay);
Timestamp startDateTS = Timestamp.valueOf(Objects.requireNonNull(dateTS).toLocalDateTime().minusDays(maxDay));
interval_start_time_epoc = startDateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
- ((long) startDateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC);
@@ -407,7 +407,7 @@ public static void markAcceleratorDeviceStatusToContainer (ContainerData contain
interval_start_time_epoc,
interval_end_time_epoc,
measurementDurationMinutesInDouble.intValue() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
- LOGGER.info(podMetricsUrl);
+ LOGGER.debug(podMetricsUrl);
client.setBaseURL(podMetricsUrl);
genericJsonObject = client.fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
From b92b4b235997fff99e4ce49970c4f5a91a306d14 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Tue, 8 Oct 2024 12:01:42 +0530
Subject: [PATCH 15/64] Move util functions from CommonUtils to
RecommendationUtils
Signed-off-by: bharathappali
---
.../engine/RecommendationEngine.java | 4 +-
.../utils/RecommendationUtils.java | 182 ++++++++++++++++++
.../autotune/common/utils/CommonUtils.java | 160 ---------------
3 files changed, 184 insertions(+), 162 deletions(-)
diff --git a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
index 81ad283b9..734eac5da 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
@@ -2020,7 +2020,7 @@ private void fetchContainerMetricsBasedOnDataSourceAndProfile(KruizeObject kruiz
// Check if the container data has Accelerator support else check for Accelerator metrics
if (null == containerData.getContainerDeviceList() || !containerData.getContainerDeviceList().isAcceleratorDeviceDetected()) {
- CommonUtils.markAcceleratorDeviceStatusToContainer(containerData,
+ RecommendationUtils.markAcceleratorDeviceStatusToContainer(containerData,
maxDateQuery,
namespace,
workload,
@@ -2213,7 +2213,7 @@ private void fetchContainerMetricsBasedOnDataSourceAndProfile(KruizeObject kruiz
String timestamp = sdf.format(new Date(epochTime * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC));
Date date = sdf.parse(timestamp);
Timestamp tempTime = new Timestamp(date.getTime());
- Timestamp eTime = CommonUtils.getNearestTimestamp(containerDataResults,
+ Timestamp eTime = RecommendationUtils.getNearestTimestamp(containerDataResults,
tempTime,
AnalyzerConstants.AcceleratorConstants.AcceleratorMetricConstants.TIMESTAMP_RANGE_CHECK_IN_MINUTES);
diff --git a/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java b/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
index 158935eed..685a438fa 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
@@ -1,15 +1,38 @@
package com.autotune.analyzer.recommendations.utils;
+import com.autotune.analyzer.exceptions.FetchMetricsError;
import com.autotune.analyzer.recommendations.RecommendationConfigItem;
import com.autotune.analyzer.recommendations.RecommendationConstants;
+import com.autotune.analyzer.recommendations.term.Terms;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.common.data.metrics.MetricResults;
+import com.autotune.common.data.result.ContainerData;
import com.autotune.common.data.result.IntervalResults;
+import com.autotune.common.data.system.info.device.ContainerDeviceList;
+import com.autotune.common.data.system.info.device.accelerator.AcceleratorDeviceData;
+import com.autotune.common.datasource.DataSourceInfo;
+import com.autotune.common.utils.CommonUtils;
+import com.autotune.utils.GenericRestApiClient;
+import com.autotune.utils.KruizeConstants;
+import com.google.gson.*;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.net.URLEncoder;
+import java.security.KeyManagementException;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
import java.sql.Timestamp;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
import java.util.*;
+import static com.autotune.analyzer.utils.AnalyzerConstants.ServiceConstants.CHARACTER_ENCODING;
+
public class RecommendationUtils {
+ private static final Logger LOGGER = LoggerFactory.getLogger(RecommendationUtils.class);
public static RecommendationConfigItem getCurrentValue(Map filteredResultsMap,
Timestamp timestampToExtract,
AnalyzerConstants.ResourceSetting resourceSetting,
@@ -129,6 +152,165 @@ else if (resourceSetting == AnalyzerConstants.ResourceSetting.limits) {
}
}
+    public static void markAcceleratorDeviceStatusToContainer (ContainerData containerData, // detects Kruize-supported MIG GPUs used by this container and records them on containerData
+                                                               String maxDateQuery,
+                                                               String namespace,
+                                                               String workload,
+                                                               String workload_type,
+                                                               DataSourceInfo dataSourceInfo,
+                                                               Map termsMap,
+                                                               Double measurementDurationMinutesInDouble,
+                                                               String gpuDetectionQuery)
+            throws IOException, NoSuchAlgorithmException, KeyStoreException,
+            KeyManagementException, ParseException, FetchMetricsError {
+
+        SimpleDateFormat sdf = new SimpleDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT, Locale.ROOT);
+        String containerName = containerData.getContainer_name();
+        String queryToEncode = null;
+        long interval_end_time_epoc = 0;
+        long interval_start_time_epoc = 0;
+
+        LOGGER.debug("maxDateQuery: {}", maxDateQuery);
+        queryToEncode =  maxDateQuery
+                .replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
+                .replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
+                .replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
+                .replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);
+
+        String dateMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATE_ENDPOINT_WITH_QUERY,
+                dataSourceInfo.getUrl(),
+                URLEncoder.encode(queryToEncode, CHARACTER_ENCODING)
+        );
+
+        LOGGER.debug(dateMetricsUrl);
+        GenericRestApiClient client = new GenericRestApiClient(dataSourceInfo);
+        client.setBaseURL(dateMetricsUrl);
+        JSONObject genericJsonObject = client.fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
+        JsonObject jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
+        JsonArray resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);
+
+        if (null == resultArray || resultArray.isEmpty()) {
+            // Need to alert that container max duration is not detected
+            // Ignoring it here, as we take care of it at generate recommendations
+            return;
+        }
+
+        resultArray = resultArray.get(0)
+                .getAsJsonObject().getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.VALUE);
+        long epochTime = resultArray.get(0).getAsLong(); // VALUE pair is [epoch-seconds, sample]; index 0 is the timestamp
+        String timestamp = sdf.format(new Date(epochTime * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC));
+        Date date = sdf.parse(timestamp);
+        Timestamp dateTS = new Timestamp(date.getTime());
+        interval_end_time_epoc = dateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
+                - ((long) dateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE); // getTimezoneOffset() is in minutes -> seconds
+        int maxDay = Terms.getMaxDays(termsMap);
+        LOGGER.debug(KruizeConstants.APIMessages.MAX_DAY, maxDay);
+        Timestamp startDateTS = Timestamp.valueOf(Objects.requireNonNull(dateTS).toLocalDateTime().minusDays(maxDay));
+        interval_start_time_epoc = startDateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
+                - ((long) startDateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE); // FIX: offset is minutes; was wrongly scaled by NO_OF_MSECS_IN_SEC, inconsistent with interval_end_time_epoc above
+
+        gpuDetectionQuery = gpuDetectionQuery.replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
+                .replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
+                .replace(AnalyzerConstants.MEASUREMENT_DURATION_IN_MIN_VARAIBLE, Integer.toString(measurementDurationMinutesInDouble.intValue()))
+                .replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
+                .replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);
+
+        String podMetricsUrl;
+        try {
+            podMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATASOURCE_ENDPOINT_WITH_QUERY,
+                    dataSourceInfo.getUrl(),
+                    URLEncoder.encode(gpuDetectionQuery, CHARACTER_ENCODING),
+                    interval_start_time_epoc,
+                    interval_end_time_epoc,
+                    measurementDurationMinutesInDouble.intValue() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
+            LOGGER.debug(podMetricsUrl);
+            client.setBaseURL(podMetricsUrl);
+            genericJsonObject = client.fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
+
+            jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
+            resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);
+
+            if (null != resultArray && !resultArray.isEmpty()) {
+                for (JsonElement result : resultArray) {
+                    JsonObject resultObject = result.getAsJsonObject();
+                    JsonArray valuesArray = resultObject.getAsJsonArray(KruizeConstants.DataSourceConstants
+                            .DataSourceQueryJSONKeys.VALUES);
+
+                    for (JsonElement element : valuesArray) {
+                        JsonArray valueArray = element.getAsJsonArray();
+                        double value = valueArray.get(1).getAsDouble(); // NOTE(review): value is read but never used until the TODO below lands
+                        // TODO: Check for non-zero values to mark as GPU workload
+                        break;
+                    }
+
+                    JsonObject metricObject = resultObject.getAsJsonObject(KruizeConstants.JSONKeys.METRIC);
+                    String modelName = metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString(); // NOTE(review): NPEs if MODEL_NAME key is absent — the null check below never fires; confirm metric always carries modelName
+                    if (null == modelName)
+                        continue;
+
+                    boolean isSupportedMig = checkIfModelIsKruizeSupportedMIG(modelName);
+                    if (isSupportedMig) {
+                        AcceleratorDeviceData acceleratorDeviceData = new AcceleratorDeviceData(metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString(),
+                                metricObject.get(KruizeConstants.JSONKeys.HOSTNAME).getAsString(),
+                                metricObject.get(KruizeConstants.JSONKeys.UUID).getAsString(),
+                                metricObject.get(KruizeConstants.JSONKeys.DEVICE).getAsString(),
+                                isSupportedMig);
+
+
+                        if (null == containerData.getContainerDeviceList()) {
+                            ContainerDeviceList containerDeviceList = new ContainerDeviceList();
+                            containerData.setContainerDeviceList(containerDeviceList);
+                        }
+                        containerData.getContainerDeviceList().addDevice(AnalyzerConstants.DeviceType.ACCELERATOR, acceleratorDeviceData);
+                        // TODO: Currently we consider only the first mig supported GPU
+                        return;
+                    }
+                }
+            }
+        } catch (IOException | NoSuchAlgorithmException | KeyStoreException | KeyManagementException |
+                 JsonSyntaxException e) {
+            throw new RuntimeException(e); // NOTE(review): wraps checked exceptions already in the throws clause — consider rethrowing instead
+        }
+    }
+
+    public static boolean checkIfModelIsKruizeSupportedMIG(String modelName) { // true when the GPU model string is a MIG-capable model Kruize supports (A100 40GB/80GB, H100 80GB)
+        if (null == modelName || modelName.isEmpty())
+            return false;
+
+        modelName = modelName.toUpperCase(); // case-insensitive match; NOTE(review): toUpperCase(Locale.ROOT) would be locale-safe
+
+        boolean A100_CHECK = (modelName.contains("A100") && // A100 in either 40GB or 80GB variant
+                (modelName.contains("40GB") || modelName.contains("80GB")));
+
+        boolean H100_CHECK = false;
+
+        if (!A100_CHECK) { // only probe for H100 when the A100 check did not already match
+            H100_CHECK = (modelName.contains("H100") && modelName.contains("80GB"));
+        }
+
+        return A100_CHECK || H100_CHECK;
+    }
+
+    public static Timestamp getNearestTimestamp(HashMap containerDataResults, Timestamp targetTime, int minutesRange) { // nearest map key within +/- minutesRange of targetTime; null when none qualifies
+        long rangeInMillis = (long) minutesRange * 60 * 1000; // window half-width in ms; long cast avoids int overflow
+        long targetTimeMillis = targetTime.getTime();
+
+        Timestamp nearestTimestamp = null;
+        long nearestDistance = Long.MAX_VALUE;
+
+        for (Map.Entry entry : containerDataResults.entrySet()) { // linear scan; keys are the interval Timestamps
+            Timestamp currentTimestamp = entry.getKey();
+            long currentTimeMillis = currentTimestamp.getTime();
+            long distance = Math.abs(targetTimeMillis - currentTimeMillis);
+
+            if (distance <= rangeInMillis && distance < nearestDistance) { // keep the closest key inside the window
+                nearestDistance = distance;
+                nearestTimestamp = currentTimestamp;
+            }
+        }
+
+        return nearestTimestamp;
+    }
}
diff --git a/src/main/java/com/autotune/common/utils/CommonUtils.java b/src/main/java/com/autotune/common/utils/CommonUtils.java
index 3c454fc8a..582c127e8 100644
--- a/src/main/java/com/autotune/common/utils/CommonUtils.java
+++ b/src/main/java/com/autotune/common/utils/CommonUtils.java
@@ -335,164 +335,4 @@ public static boolean isInvalidDataSource(DataSourceInfo datasource) {
return datasource == null || datasource.getAuthenticationConfig() == null ||
datasource.getAuthenticationConfig().toString().isEmpty();
}
-
- public static void markAcceleratorDeviceStatusToContainer (ContainerData containerData,
- String maxDateQuery,
- String namespace,
- String workload,
- String workload_type,
- DataSourceInfo dataSourceInfo,
- Map termsMap,
- Double measurementDurationMinutesInDouble,
- String gpuDetectionQuery)
- throws IOException, NoSuchAlgorithmException, KeyStoreException,
- KeyManagementException, ParseException, FetchMetricsError {
-
- SimpleDateFormat sdf = new SimpleDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT, Locale.ROOT);
- String containerName = containerData.getContainer_name();
- String queryToEncode = null;
- long interval_end_time_epoc = 0;
- long interval_start_time_epoc = 0;
-
- LOGGER.debug("maxDateQuery: {}", maxDateQuery);
- queryToEncode = maxDateQuery
- .replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
- .replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
- .replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
- .replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);
-
- String dateMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATE_ENDPOINT_WITH_QUERY,
- dataSourceInfo.getUrl(),
- URLEncoder.encode(queryToEncode, CHARACTER_ENCODING)
- );
-
- LOGGER.debug(dateMetricsUrl);
- GenericRestApiClient client = new GenericRestApiClient(dataSourceInfo);
- client.setBaseURL(dateMetricsUrl);
- JSONObject genericJsonObject = client.fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
- JsonObject jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
- JsonArray resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);
-
- if (null == resultArray || resultArray.isEmpty()) {
- // Need to alert that container max duration is not detected
- // Ignoring it here, as we take care of it at generate recommendations
- return;
- }
-
- resultArray = resultArray.get(0)
- .getAsJsonObject().getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.VALUE);
- long epochTime = resultArray.get(0).getAsLong();
- String timestamp = sdf.format(new Date(epochTime * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC));
- Date date = sdf.parse(timestamp);
- Timestamp dateTS = new Timestamp(date.getTime());
- interval_end_time_epoc = dateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
- - ((long) dateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
- int maxDay = Terms.getMaxDays(termsMap);
- LOGGER.debug(KruizeConstants.APIMessages.MAX_DAY, maxDay);
- Timestamp startDateTS = Timestamp.valueOf(Objects.requireNonNull(dateTS).toLocalDateTime().minusDays(maxDay));
- interval_start_time_epoc = startDateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
- - ((long) startDateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC);
-
- gpuDetectionQuery = gpuDetectionQuery.replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
- .replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
- .replace(AnalyzerConstants.MEASUREMENT_DURATION_IN_MIN_VARAIBLE, Integer.toString(measurementDurationMinutesInDouble.intValue()))
- .replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
- .replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);
-
- String podMetricsUrl;
- try {
- podMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATASOURCE_ENDPOINT_WITH_QUERY,
- dataSourceInfo.getUrl(),
- URLEncoder.encode(gpuDetectionQuery, CHARACTER_ENCODING),
- interval_start_time_epoc,
- interval_end_time_epoc,
- measurementDurationMinutesInDouble.intValue() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
- LOGGER.debug(podMetricsUrl);
- client.setBaseURL(podMetricsUrl);
- genericJsonObject = client.fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
-
- jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
- resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);
-
- if (null != resultArray && !resultArray.isEmpty()) {
- for (JsonElement result : resultArray) {
- JsonObject resultObject = result.getAsJsonObject();
- JsonArray valuesArray = resultObject.getAsJsonArray(KruizeConstants.DataSourceConstants
- .DataSourceQueryJSONKeys.VALUES);
-
- for (JsonElement element : valuesArray) {
- JsonArray valueArray = element.getAsJsonArray();
- double value = valueArray.get(1).getAsDouble();
- // TODO: Check for non-zero values to mark as GPU workload
- break;
- }
-
- JsonObject metricObject = resultObject.getAsJsonObject(KruizeConstants.JSONKeys.METRIC);
- String modelName = metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString();
- if (null == modelName)
- continue;
-
- boolean isSupportedMig = checkIfModelIsKruizeSupportedMIG(modelName);
- if (isSupportedMig) {
- AcceleratorDeviceData acceleratorDeviceData = new AcceleratorDeviceData(metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString(),
- metricObject.get(KruizeConstants.JSONKeys.HOSTNAME).getAsString(),
- metricObject.get(KruizeConstants.JSONKeys.UUID).getAsString(),
- metricObject.get(KruizeConstants.JSONKeys.DEVICE).getAsString(),
- isSupportedMig);
-
-
- if (null == containerData.getContainerDeviceList()) {
- ContainerDeviceList containerDeviceList = new ContainerDeviceList();
- containerData.setContainerDeviceList(containerDeviceList);
- }
- containerData.getContainerDeviceList().addDevice(AnalyzerConstants.DeviceType.ACCELERATOR, acceleratorDeviceData);
- // TODO: Currently we consider only the first mig supported GPU
- return;
- }
- }
- }
- } catch (IOException | NoSuchAlgorithmException | KeyStoreException | KeyManagementException |
- JsonSyntaxException e) {
- throw new RuntimeException(e);
- }
- }
-
- public static boolean checkIfModelIsKruizeSupportedMIG(String modelName) {
- if (null == modelName || modelName.isEmpty())
- return false;
-
- modelName = modelName.toUpperCase();
-
- boolean A100_CHECK = (modelName.contains("A100") &&
- (modelName.contains("40GB") || modelName.contains("80GB")));
-
- boolean H100_CHECK = false;
-
- if (!A100_CHECK) {
- H100_CHECK = (modelName.contains("H100") && modelName.contains("80GB"));
- }
-
- return A100_CHECK || H100_CHECK;
- }
-
- public static Timestamp getNearestTimestamp(HashMap containerDataResults, Timestamp targetTime, int minutesRange) {
- long rangeInMillis = (long) minutesRange * 60 * 1000;
- long targetTimeMillis = targetTime.getTime();
-
- Timestamp nearestTimestamp = null;
- long nearestDistance = Long.MAX_VALUE;
-
- for (Map.Entry entry : containerDataResults.entrySet()) {
- Timestamp currentTimestamp = entry.getKey();
- long currentTimeMillis = currentTimestamp.getTime();
- long distance = Math.abs(targetTimeMillis - currentTimeMillis);
-
- if (distance <= rangeInMillis && distance < nearestDistance) {
- nearestDistance = distance;
- nearestTimestamp = currentTimestamp;
- }
- }
-
- return nearestTimestamp;
- }
}
From c6418e5891e5f0f65eaa1afbb35bb8e2b8127202 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Fri, 4 Oct 2024 10:35:28 +0530
Subject: [PATCH 16/64] Add Recommendation Logic Fix rebase issue Update the
GPU perf percentile to 98
Signed-off-by: bharathappali
---
.../RecommendationConstants.java | 3 +
.../engine/RecommendationEngine.java | 18 ++++-
.../model/CostBasedRecommendationModel.java | 81 +++++++++++++++++++
.../PerformanceBasedRecommendationModel.java | 76 ++++++++++++++++-
.../model/RecommendationModel.java | 3 +
.../utils/RecommendationUtils.java | 42 +++++++++-
.../analyzer/utils/AnalyzerConstants.java | 2 +-
.../metadata/AcceleratorMetaDataService.java | 4 +-
8 files changed, 220 insertions(+), 9 deletions(-)
diff --git a/src/main/java/com/autotune/analyzer/recommendations/RecommendationConstants.java b/src/main/java/com/autotune/analyzer/recommendations/RecommendationConstants.java
index 4cc4be488..3a203c4af 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/RecommendationConstants.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/RecommendationConstants.java
@@ -738,6 +738,9 @@ public static class PercentileConstants {
public static final Integer TWENTYFIVE_PERCENTILE = 25;
public static final Integer SEVENTYFIVE_PERCENTILE = 75;
public static final Integer FIFTY_PERCENTILE = 50;
+ public static final Integer COST_ACCELERATOR_PERCENTILE = 60;
+ public static final Integer PERFORMANCE_ACCELERATOR_PERCENTILE = 98;
+
}
}
}
diff --git a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
index 734eac5da..92c15f097 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/engine/RecommendationEngine.java
@@ -698,6 +698,7 @@ private MappedRecommendationForModel generateRecommendationBasedOnModel(Timestam
// Get the Recommendation Items
RecommendationConfigItem recommendationCpuRequest = model.getCPURequestRecommendation(filteredResultsMap, notifications);
RecommendationConfigItem recommendationMemRequest = model.getMemoryRequestRecommendation(filteredResultsMap, notifications);
+ Map recommendationAcceleratorRequestMap = model.getAcceleratorRequestRecommendation(filteredResultsMap, notifications);
// Get the Recommendation Items
// Calling requests on limits as we are maintaining limits and requests as same
@@ -728,7 +729,8 @@ private MappedRecommendationForModel generateRecommendationBasedOnModel(Timestam
internalMapToPopulate,
numPods,
cpuThreshold,
- memoryThreshold
+ memoryThreshold,
+ recommendationAcceleratorRequestMap
);
} else {
RecommendationNotification notification = new RecommendationNotification(
@@ -1077,7 +1079,8 @@ private MappedRecommendationForModel generateNamespaceRecommendationBasedOnModel
internalMapToPopulate,
numPodsInNamespace,
namespaceCpuThreshold,
- namespaceMemoryThreshold
+ namespaceMemoryThreshold,
+ null
);
} else {
RecommendationNotification notification = new RecommendationNotification(
@@ -1100,13 +1103,17 @@ private MappedRecommendationForModel generateNamespaceRecommendationBasedOnModel
* @param numPods The number of pods to consider for the recommendation.
* @param cpuThreshold The CPU usage threshold for the recommendation.
* @param memoryThreshold The memory usage threshold for the recommendation.
+ * @param recommendationAcceleratorRequestMap The Map which has Accelerator recommendations
* @return {@code true} if the internal map was successfully populated; {@code false} otherwise.
*/
private boolean populateRecommendation(Map.Entry termEntry,
MappedRecommendationForModel recommendationModel,
ArrayList notifications,
HashMap internalMapToPopulate,
- int numPods, double cpuThreshold, double memoryThreshold) {
+ int numPods,
+ double cpuThreshold,
+ double memoryThreshold,
+ Map recommendationAcceleratorRequestMap) {
// Check for cpu & memory Thresholds (Duplicate check if the caller is generateRecommendations)
String recommendationTerm = termEntry.getKey();
double hours = termEntry.getValue().getDays() * KruizeConstants.TimeConv.NO_OF_HOURS_PER_DAY * KruizeConstants.TimeConv.
@@ -1690,6 +1697,11 @@ private boolean populateRecommendation(Map.Entry termEntry,
config.put(AnalyzerConstants.ResourceSetting.requests, requestsMap);
}
+ // Check if accelerator map is not empty and add to limits map
+ if (null != recommendationAcceleratorRequestMap && !recommendationAcceleratorRequestMap.isEmpty()) {
+ limitsMap.putAll(recommendationAcceleratorRequestMap);
+ }
+
// Set Limits Map
if (!limitsMap.isEmpty()) {
config.put(AnalyzerConstants.ResourceSetting.limits, limitsMap);
diff --git a/src/main/java/com/autotune/analyzer/recommendations/model/CostBasedRecommendationModel.java b/src/main/java/com/autotune/analyzer/recommendations/model/CostBasedRecommendationModel.java
index db8c783ae..c27d7a445 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/model/CostBasedRecommendationModel.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/model/CostBasedRecommendationModel.java
@@ -3,16 +3,21 @@
import com.autotune.analyzer.recommendations.RecommendationConfigItem;
import com.autotune.analyzer.recommendations.RecommendationConstants;
import com.autotune.analyzer.recommendations.RecommendationNotification;
+import com.autotune.analyzer.recommendations.utils.RecommendationUtils;
import com.autotune.analyzer.utils.AnalyzerConstants;
+import com.autotune.common.data.metrics.AcceleratorMetricResult;
import com.autotune.common.data.metrics.MetricAggregationInfoResults;
import com.autotune.common.data.metrics.MetricResults;
import com.autotune.common.data.result.IntervalResults;
+import com.autotune.common.data.system.info.device.accelerator.metadata.AcceleratorMetaDataService;
+import com.autotune.common.data.system.info.device.accelerator.metadata.AcceleratorProfile;
import com.autotune.common.utils.CommonUtils;
import com.autotune.utils.KruizeConstants;
import org.json.JSONArray;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import software.amazon.awssdk.services.cloudwatchlogs.endpoints.internal.Value; // FIXME(review): stray IDE auto-import, unused in this class — remove before merge
import java.sql.Timestamp;
import java.util.*;
@@ -22,6 +27,8 @@
import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.COST_CPU_PERCENTILE;
import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.COST_MEMORY_PERCENTILE;
+import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.COST_ACCELERATOR_PERCENTILE;
+
import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationValueConstants.*;
public class CostBasedRecommendationModel implements RecommendationModel {
@@ -505,6 +512,80 @@ public RecommendationConfigItem getMemoryRequestRecommendationForNamespace(Map getAcceleratorRequestRecommendation (
+ Map filteredResultsMap,
+ ArrayList notifications
+ ) {
+ List acceleratorCoreMaxValues = new ArrayList<>();
+ List acceleratorMemoryMaxValues = new ArrayList<>();
+
+ boolean isGpuWorkload = false;
+ String acceleratorModel = null;
+
+ for (Map.Entry entry : filteredResultsMap.entrySet()) {
+ IntervalResults intervalResults = entry.getValue();
+
+ // Skip if accelerator map is null
+ if (null == intervalResults.getAcceleratorMetricResultHashMap())
+ continue;
+
+ isGpuWorkload = true;
+ for (Map.Entry gpuEntry : intervalResults.getAcceleratorMetricResultHashMap().entrySet()) {
+ AcceleratorMetricResult gpuMetricResult = gpuEntry.getValue();
+
+ // Set Accelerator name
+ // TODO: Need to handle separate processing in case of container supporting multiple accelerators
+ if (null == acceleratorModel
+ && null != gpuMetricResult.getAcceleratorDeviceData().getModelName()
+ && !gpuMetricResult.getAcceleratorDeviceData().getModelName().isEmpty()
+ && CommonUtils.checkIfModelIsKruizeSupportedMIG(gpuMetricResult.getAcceleratorDeviceData().getModelName())
+ ) {
+ String obtainedAcceleratorName = CommonUtils.getSupportedModelBasedOnModelName(gpuMetricResult.getAcceleratorDeviceData().getModelName());
+ if (null != obtainedAcceleratorName)
+ acceleratorModel = obtainedAcceleratorName;
+ }
+
+ MetricResults metricResults = gpuMetricResult.getMetricResults();
+
+ // Skip if metric results is null
+ if (null == metricResults || null == metricResults.getAggregationInfoResult())
+ continue;
+
+ MetricAggregationInfoResults aggregationInfo = metricResults.getAggregationInfoResult();
+
+ // Skip if max is null or zero or negative
+ if (null == aggregationInfo.getMax() || aggregationInfo.getMax() <= 0.0)
+ continue;
+
+ boolean isCoreUsage = gpuEntry.getKey() == AnalyzerConstants.MetricName.gpuCoreUsage;
+ boolean isMemoryUsage = gpuEntry.getKey() == AnalyzerConstants.MetricName.gpuMemoryUsage;
+
+ // Skip if it's none of the Accelerator metrics
+ if (!isCoreUsage && !isMemoryUsage)
+ continue;
+
+ if (isCoreUsage) {
+ acceleratorCoreMaxValues.add(aggregationInfo.getMax());
+ } else {
+ acceleratorMemoryMaxValues.add(aggregationInfo.getMax());
+ }
+ }
+ }
+
+ if (!isGpuWorkload) {
+ return null;
+ }
+
+ double coreAverage = CommonUtils.percentile(COST_ACCELERATOR_PERCENTILE, acceleratorCoreMaxValues);
+ double memoryAverage = CommonUtils.percentile(COST_ACCELERATOR_PERCENTILE, acceleratorMemoryMaxValues);
+
+ double coreFraction = coreAverage / 100;
+ double memoryFraction = memoryAverage / 100;
+
+ return RecommendationUtils.getMapWithOptimalProfile(acceleratorModel, coreFraction, memoryFraction);
+ }
+
public static JSONObject calculateNamespaceMemoryUsage(IntervalResults intervalResults) {
// create a JSON object which should be returned here having two values, Math.max and Collections.Min
JSONObject jsonObject = new JSONObject();
diff --git a/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java b/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java
index fcaccd344..dffc83f93 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java
@@ -3,8 +3,10 @@
import com.autotune.analyzer.recommendations.RecommendationConfigItem;
import com.autotune.analyzer.recommendations.RecommendationConstants;
import com.autotune.analyzer.recommendations.RecommendationNotification;
+import com.autotune.analyzer.recommendations.utils.RecommendationUtils;
import com.autotune.analyzer.services.UpdateRecommendations;
import com.autotune.analyzer.utils.AnalyzerConstants;
+import com.autotune.common.data.metrics.AcceleratorMetricResult;
import com.autotune.common.data.metrics.MetricAggregationInfoResults;
import com.autotune.common.data.metrics.MetricResults;
import com.autotune.common.data.result.IntervalResults;
@@ -19,8 +21,8 @@
import java.util.*;
import java.util.stream.Collectors;
-import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.PERFORMANCE_CPU_PERCENTILE;
-import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.PERFORMANCE_MEMORY_PERCENTILE;
+import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.*;
+import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.PERFORMANCE_ACCELERATOR_PERCENTILE; // NOTE(review): redundant — already covered by the wildcard static import above
import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationValueConstants.*;
public class PerformanceBasedRecommendationModel implements RecommendationModel {
@@ -372,6 +374,76 @@ public RecommendationConfigItem getMemoryRequestRecommendationForNamespace(Map getAcceleratorRequestRecommendation(Map filteredResultsMap, ArrayList notifications) {
+ List acceleratorCoreMaxValues = new ArrayList<>();
+ List acceleratorMemoryMaxValues = new ArrayList<>();
+
+ boolean isGpuWorkload = false;
+ String acceleratorModel = null;
+
+ for (Map.Entry entry : filteredResultsMap.entrySet()) {
+ IntervalResults intervalResults = entry.getValue();
+
+ // Skip if accelerator map is null
+ if (null == intervalResults.getAcceleratorMetricResultHashMap())
+ continue;
+
+ isGpuWorkload = true;
+ for (Map.Entry gpuEntry : intervalResults.getAcceleratorMetricResultHashMap().entrySet()) {
+ AcceleratorMetricResult gpuMetricResult = gpuEntry.getValue();
+
+ // Set Accelerator name
+ if (null == acceleratorModel
+ && null != gpuMetricResult.getAcceleratorDeviceData().getModelName()
+ && !gpuMetricResult.getAcceleratorDeviceData().getModelName().isEmpty()
+ && CommonUtils.checkIfModelIsKruizeSupportedMIG(gpuMetricResult.getAcceleratorDeviceData().getModelName())
+ ) {
+ String obtainedAcceleratorName = CommonUtils.getSupportedModelBasedOnModelName(gpuMetricResult.getAcceleratorDeviceData().getModelName());
+ if (null != obtainedAcceleratorName)
+ acceleratorModel = obtainedAcceleratorName;
+ }
+
+ MetricResults metricResults = gpuMetricResult.getMetricResults();
+
+ // Skip if metric results is null
+ if (null == metricResults || null == metricResults.getAggregationInfoResult())
+ continue;
+
+ MetricAggregationInfoResults aggregationInfo = metricResults.getAggregationInfoResult();
+
+ // Skip if max is null or zero or negative
+ if (null == aggregationInfo.getMax() || aggregationInfo.getMax() <= 0.0)
+ continue;
+
+ boolean isCoreUsage = gpuEntry.getKey() == AnalyzerConstants.MetricName.gpuCoreUsage;
+ boolean isMemoryUsage = gpuEntry.getKey() == AnalyzerConstants.MetricName.gpuMemoryUsage;
+
+ // Skip if it's none of the Accelerator metrics
+ if (!isCoreUsage && !isMemoryUsage)
+ continue;
+
+ if (isCoreUsage) {
+ acceleratorCoreMaxValues.add(aggregationInfo.getMax());
+ } else {
+ acceleratorMemoryMaxValues.add(aggregationInfo.getMax());
+ }
+ }
+ }
+
+ if (!isGpuWorkload) {
+ return null;
+ }
+
+ double coreAverage = CommonUtils.percentile(PERFORMANCE_ACCELERATOR_PERCENTILE, acceleratorCoreMaxValues);
+ double memoryAverage = CommonUtils.percentile(PERFORMANCE_ACCELERATOR_PERCENTILE, acceleratorMemoryMaxValues);
+
+ double coreFraction = coreAverage / 100;
+ double memoryFraction = memoryAverage / 100;
+
+ return RecommendationUtils.getMapWithOptimalProfile(acceleratorModel, coreFraction, memoryFraction);
+ }
+
@Override
public String getModelName() {
return this.name;
diff --git a/src/main/java/com/autotune/analyzer/recommendations/model/RecommendationModel.java b/src/main/java/com/autotune/analyzer/recommendations/model/RecommendationModel.java
index 5a905805b..923ac0d20 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/model/RecommendationModel.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/model/RecommendationModel.java
@@ -2,6 +2,7 @@
import com.autotune.analyzer.recommendations.RecommendationConfigItem;
import com.autotune.analyzer.recommendations.RecommendationNotification;
+import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.common.data.result.IntervalResults;
import java.sql.Timestamp;
@@ -17,6 +18,8 @@ public interface RecommendationModel {
// get namespace recommendations for Memory Request
RecommendationConfigItem getMemoryRequestRecommendationForNamespace(Map filteredResultsMap, ArrayList notifications);
+ Map getAcceleratorRequestRecommendation(Map filteredResultsMap, ArrayList notifications);
+
public String getModelName();
void validate();
diff --git a/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java b/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
index 685a438fa..64592a35d 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
@@ -11,13 +11,14 @@
import com.autotune.common.data.system.info.device.ContainerDeviceList;
import com.autotune.common.data.system.info.device.accelerator.AcceleratorDeviceData;
import com.autotune.common.datasource.DataSourceInfo;
-import com.autotune.common.utils.CommonUtils;
import com.autotune.utils.GenericRestApiClient;
import com.autotune.utils.KruizeConstants;
import com.google.gson.*;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.autotune.common.data.system.info.device.accelerator.metadata.AcceleratorMetaDataService;
+import com.autotune.common.data.system.info.device.accelerator.metadata.AcceleratorProfile;
import java.io.IOException;
import java.net.URLEncoder;
@@ -312,5 +313,44 @@ public static Timestamp getNearestTimestamp(HashMap
return nearestTimestamp;
}
+ public static HashMap getMapWithOptimalProfile(
+ String acceleratorModel,
+ Double coreFraction,
+ Double memoryFraction
+ ) {
+ if (null == acceleratorModel || null == coreFraction || null == memoryFraction)
+ return null;
+
+ HashMap returnMap = new HashMap<>();
+
+ AcceleratorMetaDataService gpuMetaDataService = AcceleratorMetaDataService.getInstance();
+ AcceleratorProfile acceleratorProfile = gpuMetaDataService.getAcceleratorProfile(acceleratorModel, coreFraction, memoryFraction);
+ RecommendationConfigItem recommendationConfigItem = new RecommendationConfigItem(1.0, "cores");
+
+ if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_5GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_1_CORE_5GB, recommendationConfigItem);
+ } else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_10GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_1_CORE_10GB, recommendationConfigItem);
+ } else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_20GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_1_CORE_20GB, recommendationConfigItem);
+ } else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_2G_10GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_2_CORES_10GB, recommendationConfigItem);
+ } else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_2G_20GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_2_CORES_20GB, recommendationConfigItem);
+ } else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_3G_20GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_3_CORES_20GB, recommendationConfigItem);
+ } else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_3G_40GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_3_CORES_40GB, recommendationConfigItem);
+ } else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_4G_20GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_4_CORES_20GB, recommendationConfigItem);
+ } else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_4G_40GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_4_CORES_40GB, recommendationConfigItem);
+ } else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_7G_40GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_7_CORES_40GB, recommendationConfigItem);
+ } else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_7G_80GB)) {
+ returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_7_CORES_80GB, recommendationConfigItem);
+ }
+ return returnMap;
+ }
}
diff --git a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
index 8a7571b7c..740bb859a 100644
--- a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
+++ b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java
@@ -254,7 +254,7 @@ private SupportedAccelerators() {
}
public static final String A100_80_GB = "A100-80GB";
public static final String A100_40_GB = "A100-40GB";
- public static final String H100 = "H100";
+ public static final String H100_80_GB = "H100-80GB";
}
public static final class AcceleratorProfiles {
diff --git a/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java b/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java
index 58d43d686..6a5fd8187 100644
--- a/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java
+++ b/src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java
@@ -63,7 +63,7 @@ private static void initializeAcceleratorProfiles() {
1.0, 1.0, 1));
acceleratorProfilesMap.put(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.A100_80_GB, new ArrayList<>(commonProfiles));
- acceleratorProfilesMap.put(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.H100, new ArrayList<>(commonProfiles));
+ acceleratorProfilesMap.put(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.H100_80_GB, new ArrayList<>(commonProfiles));
acceleratorProfilesMap.put(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.A100_40_GB, new ArrayList<>(a100_40_gb_profiles));
}
@@ -84,7 +84,7 @@ public AcceleratorProfile getAcceleratorProfile(String modelName, Double require
}
modelName = modelName.strip();
if (!modelName.equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.A100_80_GB)
- && !modelName.equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.H100)
+ && !modelName.equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.H100_80_GB)
&& !modelName.equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.A100_40_GB)) {
return null;
}
From 67fb865a279841dcc9f8d9ecc1a6190ab070d5be Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Fri, 4 Oct 2024 10:35:28 +0530
Subject: [PATCH 17/64] Add recommendation generation logic
Signed-off-by: bharathappali
---
.../RecommendationConstants.java | 1 -
.../model/CostBasedRecommendationModel.java | 4 ++--
.../PerformanceBasedRecommendationModel.java | 4 ++--
.../utils/RecommendationUtils.java | 18 ++++++++++++++++++
.../autotune/common/utils/CommonUtils.java | 19 +------------------
5 files changed, 23 insertions(+), 23 deletions(-)
diff --git a/src/main/java/com/autotune/analyzer/recommendations/RecommendationConstants.java b/src/main/java/com/autotune/analyzer/recommendations/RecommendationConstants.java
index 3a203c4af..d708331e9 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/RecommendationConstants.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/RecommendationConstants.java
@@ -740,7 +740,6 @@ public static class PercentileConstants {
public static final Integer FIFTY_PERCENTILE = 50;
public static final Integer COST_ACCELERATOR_PERCENTILE = 60;
public static final Integer PERFORMANCE_ACCELERATOR_PERCENTILE = 98;
-
}
}
}
diff --git a/src/main/java/com/autotune/analyzer/recommendations/model/CostBasedRecommendationModel.java b/src/main/java/com/autotune/analyzer/recommendations/model/CostBasedRecommendationModel.java
index c27d7a445..891168f4f 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/model/CostBasedRecommendationModel.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/model/CostBasedRecommendationModel.java
@@ -539,9 +539,9 @@ public Map getAc
if (null == acceleratorModel
&& null != gpuMetricResult.getAcceleratorDeviceData().getModelName()
&& !gpuMetricResult.getAcceleratorDeviceData().getModelName().isEmpty()
- && CommonUtils.checkIfModelIsKruizeSupportedMIG(gpuMetricResult.getAcceleratorDeviceData().getModelName())
+ && RecommendationUtils.checkIfModelIsKruizeSupportedMIG(gpuMetricResult.getAcceleratorDeviceData().getModelName())
) {
- String obtainedAcceleratorName = CommonUtils.getSupportedModelBasedOnModelName(gpuMetricResult.getAcceleratorDeviceData().getModelName());
+ String obtainedAcceleratorName = RecommendationUtils.getSupportedModelBasedOnModelName(gpuMetricResult.getAcceleratorDeviceData().getModelName());
if (null != obtainedAcceleratorName)
acceleratorModel = obtainedAcceleratorName;
}
diff --git a/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java b/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java
index dffc83f93..247bd93be 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java
@@ -397,9 +397,9 @@ public Map getAc
if (null == acceleratorModel
&& null != gpuMetricResult.getAcceleratorDeviceData().getModelName()
&& !gpuMetricResult.getAcceleratorDeviceData().getModelName().isEmpty()
- && CommonUtils.checkIfModelIsKruizeSupportedMIG(gpuMetricResult.getAcceleratorDeviceData().getModelName())
+ && RecommendationUtils.checkIfModelIsKruizeSupportedMIG(gpuMetricResult.getAcceleratorDeviceData().getModelName())
) {
- String obtainedAcceleratorName = CommonUtils.getSupportedModelBasedOnModelName(gpuMetricResult.getAcceleratorDeviceData().getModelName());
+ String obtainedAcceleratorName = RecommendationUtils.getSupportedModelBasedOnModelName(gpuMetricResult.getAcceleratorDeviceData().getModelName());
if (null != obtainedAcceleratorName)
acceleratorModel = obtainedAcceleratorName;
}
diff --git a/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java b/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
index 64592a35d..45085f33c 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/utils/RecommendationUtils.java
@@ -352,5 +352,23 @@ public static HashMap
Date: Fri, 4 Oct 2024 11:28:36 +0530
Subject: [PATCH 18/64] Add Adapter for RecommendationItem to serialize and
deserialize the enum
Signed-off-by: bharathappali
---
.../adapters/RecommendationItemAdapter.java | 26 +++++++++++++++++++
.../exceptions/KruizeErrorHandler.java | 3 +++
.../analyzer/services/DSMetadataService.java | 3 +++
.../services/GenerateRecommendations.java | 2 ++
.../analyzer/services/ListDatasources.java | 2 ++
.../analyzer/services/ListExperiments.java | 2 ++
.../services/ListRecommendations.java | 2 ++
.../services/ListSupportedK8sObjects.java | 2 ++
.../services/MetricProfileService.java | 2 ++
.../services/PerformanceProfileService.java | 2 ++
.../services/UpdateRecommendations.java | 3 +++
.../analyzer/services/UpdateResults.java | 2 ++
.../autotune/database/helper/DBHelpers.java | 11 +++++++-
src/main/java/com/autotune/utils/Utils.java | 2 ++
14 files changed, 63 insertions(+), 1 deletion(-)
create mode 100644 src/main/java/com/autotune/analyzer/adapters/RecommendationItemAdapter.java
diff --git a/src/main/java/com/autotune/analyzer/adapters/RecommendationItemAdapter.java b/src/main/java/com/autotune/analyzer/adapters/RecommendationItemAdapter.java
new file mode 100644
index 000000000..7f7806e3c
--- /dev/null
+++ b/src/main/java/com/autotune/analyzer/adapters/RecommendationItemAdapter.java
@@ -0,0 +1,26 @@
+
package com.autotune.analyzer.adapters;

import com.autotune.analyzer.utils.AnalyzerConstants;
import com.google.gson.*;

import java.lang.reflect.Type;

/**
 * Gson adapter that serializes {@link AnalyzerConstants.RecommendationItem} enum
 * constants to their string form and deserializes them back by matching that string.
 * Registered via
 * {@code registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())}
 * on every GsonBuilder that handles recommendation payloads.
 */
public class RecommendationItemAdapter
        implements JsonSerializer<AnalyzerConstants.RecommendationItem>,
        JsonDeserializer<AnalyzerConstants.RecommendationItem> {

    @Override
    public JsonElement serialize(AnalyzerConstants.RecommendationItem recommendationItem,
                                 Type type,
                                 JsonSerializationContext jsonSerializationContext) {
        // toString() (not name()) so any custom string form of the enum is honoured;
        // deserialize() below matches on the same representation.
        return jsonSerializationContext.serialize(recommendationItem.toString());
    }

    @Override
    public AnalyzerConstants.RecommendationItem deserialize(JsonElement jsonElement,
                                                            Type type,
                                                            JsonDeserializationContext jsonDeserializationContext)
            throws JsonParseException {
        String value = jsonElement.getAsString();
        // Linear scan over the enum constants, comparing against the serialized form.
        for (AnalyzerConstants.RecommendationItem item : AnalyzerConstants.RecommendationItem.values()) {
            if (item.toString().equals(value)) {
                return item;
            }
        }
        // Unknown values are a hard error so bad payloads fail fast instead of
        // silently mapping to null.
        throw new JsonParseException("Unknown element " + value);
    }
}
\ No newline at end of file
diff --git a/src/main/java/com/autotune/analyzer/exceptions/KruizeErrorHandler.java b/src/main/java/com/autotune/analyzer/exceptions/KruizeErrorHandler.java
index 0f7de32a8..1de629485 100644
--- a/src/main/java/com/autotune/analyzer/exceptions/KruizeErrorHandler.java
+++ b/src/main/java/com/autotune/analyzer/exceptions/KruizeErrorHandler.java
@@ -15,7 +15,9 @@
*******************************************************************************/
package com.autotune.analyzer.exceptions;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.serviceObjects.FailedUpdateResultsAPIObject;
+import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
@@ -56,6 +58,7 @@ public void handle(String target, Request baseRequest, HttpServletRequest reques
.disableHtmlEscaping()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
String gsonStr = gsonObj.toJson(new KruizeResponse(origMessage, errorCode, "", "ERROR", myList));
diff --git a/src/main/java/com/autotune/analyzer/services/DSMetadataService.java b/src/main/java/com/autotune/analyzer/services/DSMetadataService.java
index 904ada6ad..c3db126ad 100644
--- a/src/main/java/com/autotune/analyzer/services/DSMetadataService.java
+++ b/src/main/java/com/autotune/analyzer/services/DSMetadataService.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.KruizeResponse;
import com.autotune.analyzer.serviceObjects.DSMetadataAPIObject;
import com.autotune.analyzer.utils.AnalyzerConstants;
@@ -240,6 +241,7 @@ private void sendSuccessResponse(HttpServletResponse response, DataSourceMetadat
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
gsonStr = gsonObj.toJson(dataSourceMetadata);
}
@@ -416,6 +418,7 @@ private Gson createGsonObject() {
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
}
private boolean isValidBooleanValue(String value) {
diff --git a/src/main/java/com/autotune/analyzer/services/GenerateRecommendations.java b/src/main/java/com/autotune/analyzer/services/GenerateRecommendations.java
index 64d05fe9c..0b7d183fa 100644
--- a/src/main/java/com/autotune/analyzer/services/GenerateRecommendations.java
+++ b/src/main/java/com/autotune/analyzer/services/GenerateRecommendations.java
@@ -15,6 +15,7 @@
*******************************************************************************/
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.FetchMetricsError;
import com.autotune.analyzer.kruizeObject.KruizeObject;
import com.autotune.analyzer.recommendations.engine.RecommendationEngine;
@@ -171,6 +172,7 @@ public boolean shouldSkipClass(Class> clazz) {
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.setExclusionStrategies(strategy)
.create();
gsonStr = gsonObj.toJson(recommendationList);
diff --git a/src/main/java/com/autotune/analyzer/services/ListDatasources.java b/src/main/java/com/autotune/analyzer/services/ListDatasources.java
index 1af77454d..2e870edbb 100644
--- a/src/main/java/com/autotune/analyzer/services/ListDatasources.java
+++ b/src/main/java/com/autotune/analyzer/services/ListDatasources.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.serviceObjects.ListDatasourcesAPIObject;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.analyzer.utils.AnalyzerErrorConstants;
@@ -148,6 +149,7 @@ private Gson createGsonObject() {
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
}
diff --git a/src/main/java/com/autotune/analyzer/services/ListExperiments.java b/src/main/java/com/autotune/analyzer/services/ListExperiments.java
index e71d5e96f..be23a1596 100644
--- a/src/main/java/com/autotune/analyzer/services/ListExperiments.java
+++ b/src/main/java/com/autotune/analyzer/services/ListExperiments.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.experiment.KruizeExperiment;
import com.autotune.analyzer.kruizeObject.KruizeObject;
import com.autotune.analyzer.serviceObjects.ContainerAPIObject;
@@ -281,6 +282,7 @@ private Gson createGsonObject() {
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.setExclusionStrategies(new ExclusionStrategy() {
@Override
public boolean shouldSkipField(FieldAttributes f) {
diff --git a/src/main/java/com/autotune/analyzer/services/ListRecommendations.java b/src/main/java/com/autotune/analyzer/services/ListRecommendations.java
index 69bcca37c..c0dcfa942 100644
--- a/src/main/java/com/autotune/analyzer/services/ListRecommendations.java
+++ b/src/main/java/com/autotune/analyzer/services/ListRecommendations.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.KruizeResponse;
import com.autotune.analyzer.kruizeObject.KruizeObject;
import com.autotune.analyzer.serviceObjects.ContainerAPIObject;
@@ -224,6 +225,7 @@ public boolean shouldSkipClass(Class> clazz) {
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.setExclusionStrategies(strategy)
.create();
gsonStr = gsonObj.toJson(recommendationList);
diff --git a/src/main/java/com/autotune/analyzer/services/ListSupportedK8sObjects.java b/src/main/java/com/autotune/analyzer/services/ListSupportedK8sObjects.java
index 1ac7dc39d..c67a2659d 100644
--- a/src/main/java/com/autotune/analyzer/services/ListSupportedK8sObjects.java
+++ b/src/main/java/com/autotune/analyzer/services/ListSupportedK8sObjects.java
@@ -15,6 +15,7 @@
*******************************************************************************/
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.serviceObjects.ListSupportedK8sObjectsSO;
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
import com.autotune.analyzer.utils.AnalyzerConstants;
@@ -57,6 +58,7 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
// Convert the Service object to JSON
responseGSONString = gsonObj.toJson(listSupportedK8sObjectsSO);
diff --git a/src/main/java/com/autotune/analyzer/services/MetricProfileService.java b/src/main/java/com/autotune/analyzer/services/MetricProfileService.java
index d4311d07d..6f92e186e 100644
--- a/src/main/java/com/autotune/analyzer/services/MetricProfileService.java
+++ b/src/main/java/com/autotune/analyzer/services/MetricProfileService.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.InvalidValueException;
import com.autotune.analyzer.exceptions.PerformanceProfileResponse;
import com.autotune.analyzer.performanceProfiles.MetricProfileCollection;
@@ -378,6 +379,7 @@ private Gson createGsonObject() {
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
// a custom serializer for serializing metadata of JsonNode type.
.registerTypeAdapter(JsonNode.class, new JsonSerializer() {
@Override
diff --git a/src/main/java/com/autotune/analyzer/services/PerformanceProfileService.java b/src/main/java/com/autotune/analyzer/services/PerformanceProfileService.java
index 71be6267e..3ea3a207f 100644
--- a/src/main/java/com/autotune/analyzer/services/PerformanceProfileService.java
+++ b/src/main/java/com/autotune/analyzer/services/PerformanceProfileService.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.InvalidValueException;
import com.autotune.analyzer.exceptions.PerformanceProfileResponse;
import com.autotune.analyzer.performanceProfiles.PerformanceProfile;
@@ -130,6 +131,7 @@ protected void doGet(HttpServletRequest req, HttpServletResponse response) throw
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.setExclusionStrategies(new ExclusionStrategy() {
@Override
public boolean shouldSkipField(FieldAttributes f) {
diff --git a/src/main/java/com/autotune/analyzer/services/UpdateRecommendations.java b/src/main/java/com/autotune/analyzer/services/UpdateRecommendations.java
index 903378655..45218d07b 100644
--- a/src/main/java/com/autotune/analyzer/services/UpdateRecommendations.java
+++ b/src/main/java/com/autotune/analyzer/services/UpdateRecommendations.java
@@ -15,12 +15,14 @@
*******************************************************************************/
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.FetchMetricsError;
import com.autotune.analyzer.kruizeObject.KruizeObject;
import com.autotune.analyzer.recommendations.engine.RecommendationEngine;
import com.autotune.analyzer.serviceObjects.ContainerAPIObject;
import com.autotune.analyzer.serviceObjects.Converters;
import com.autotune.analyzer.serviceObjects.ListRecommendationsAPIObject;
+import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.analyzer.utils.AnalyzerErrorConstants;
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
import com.autotune.common.data.result.ContainerData;
@@ -168,6 +170,7 @@ public boolean shouldSkipClass(Class> clazz) {
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.setExclusionStrategies(strategy)
.create();
gsonStr = gsonObj.toJson(recommendationList);
diff --git a/src/main/java/com/autotune/analyzer/services/UpdateResults.java b/src/main/java/com/autotune/analyzer/services/UpdateResults.java
index 7ae38192e..94d929a9a 100644
--- a/src/main/java/com/autotune/analyzer/services/UpdateResults.java
+++ b/src/main/java/com/autotune/analyzer/services/UpdateResults.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.KruizeResponse;
import com.autotune.analyzer.experiment.ExperimentInitiator;
import com.autotune.analyzer.performanceProfiles.PerformanceProfile;
@@ -78,6 +79,7 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
Gson gson = new GsonBuilder()
.registerTypeAdapter(Double.class, new CustomNumberDeserializer())
.registerTypeAdapter(Integer.class, new CustomNumberDeserializer())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
LOGGER.debug("updateResults API request payload for requestID {} is {}", calCount, inputData);
try {
diff --git a/src/main/java/com/autotune/database/helper/DBHelpers.java b/src/main/java/com/autotune/database/helper/DBHelpers.java
index 23069d348..caab21895 100644
--- a/src/main/java/com/autotune/database/helper/DBHelpers.java
+++ b/src/main/java/com/autotune/database/helper/DBHelpers.java
@@ -16,6 +16,7 @@
package com.autotune.database.helper;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.InvalidConversionOfRecommendationEntryException;
import com.autotune.analyzer.kruizeObject.KruizeObject;
import com.autotune.analyzer.kruizeObject.SloInfo;
@@ -334,6 +335,7 @@ public static KruizeResultsEntry convertExperimentResultToExperimentResultsTable
.enableComplexMapKeySerialization()
.setDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT)
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
try {
kruizeResultsEntry = new KruizeResultsEntry();
@@ -473,6 +475,7 @@ public static KruizeRecommendationEntry convertKruizeObjectTORecommendation(Krui
.enableComplexMapKeySerialization()
.setDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT)
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
try {
ListRecommendationsAPIObject listRecommendationsAPIObject = getListRecommendationAPIObjectForDB(
@@ -480,7 +483,11 @@ public static KruizeRecommendationEntry convertKruizeObjectTORecommendation(Krui
if (null == listRecommendationsAPIObject) {
return null;
}
- LOGGER.debug(new GsonBuilder().setPrettyPrinting().create().toJson(listRecommendationsAPIObject));
+ LOGGER.debug(new GsonBuilder()
+ .setPrettyPrinting()
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .create()
+ .toJson(listRecommendationsAPIObject));
kruizeRecommendationEntry = new KruizeRecommendationEntry();
kruizeRecommendationEntry.setVersion(KruizeConstants.KRUIZE_RECOMMENDATION_API_VERSION.LATEST.getVersionNumber());
kruizeRecommendationEntry.setExperiment_name(listRecommendationsAPIObject.getExperimentName());
@@ -557,6 +564,7 @@ public static List convertResultEntryToUpdateResultsAPIO
.enableComplexMapKeySerialization()
.setDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT)
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
List updateResultsAPIObjects = new ArrayList<>();
for (KruizeResultsEntry kruizeResultsEntry : kruizeResultsEntries) {
@@ -626,6 +634,7 @@ public static List convertRecommendationEntryToRec
.enableComplexMapKeySerialization()
.setDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT)
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
List listRecommendationsAPIObjectList = new ArrayList<>();
for (KruizeRecommendationEntry kruizeRecommendationEntry : kruizeRecommendationEntryList) {
diff --git a/src/main/java/com/autotune/utils/Utils.java b/src/main/java/com/autotune/utils/Utils.java
index 3d65dea4c..4b94b3270 100644
--- a/src/main/java/com/autotune/utils/Utils.java
+++ b/src/main/java/com/autotune/utils/Utils.java
@@ -16,6 +16,7 @@
package com.autotune.utils;
+import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
import com.autotune.common.data.result.ContainerData;
@@ -169,6 +170,7 @@ public static T getClone(T object, Class classMetadata) {
.setPrettyPrinting()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
+ .registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
.create();
String serialisedString = gson.toJson(object);
From bd821753c9f3205b10414530723bb55c80fc03a3 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Fri, 4 Oct 2024 14:30:19 +0530
Subject: [PATCH 19/64] Add adapter for DeviceDetails
Signed-off-by: bharathappali
---
.../adapters/DeviceDetailsAdapter.java | 79 +++++++++++++++++++
.../analyzer/services/DSMetadataService.java | 3 +
.../services/GenerateRecommendations.java | 3 +
.../analyzer/services/ListDatasources.java | 3 +
.../analyzer/services/ListExperiments.java | 3 +
.../services/ListRecommendations.java | 3 +
.../services/ListSupportedK8sObjects.java | 3 +
.../services/MetricProfileService.java | 3 +
.../services/PerformanceProfileService.java | 3 +
.../services/UpdateRecommendations.java | 3 +
.../analyzer/services/UpdateResults.java | 3 +
.../autotune/database/helper/DBHelpers.java | 7 ++
12 files changed, 116 insertions(+)
create mode 100644 src/main/java/com/autotune/analyzer/adapters/DeviceDetailsAdapter.java
diff --git a/src/main/java/com/autotune/analyzer/adapters/DeviceDetailsAdapter.java b/src/main/java/com/autotune/analyzer/adapters/DeviceDetailsAdapter.java
new file mode 100644
index 000000000..cfe5ff150
--- /dev/null
+++ b/src/main/java/com/autotune/analyzer/adapters/DeviceDetailsAdapter.java
@@ -0,0 +1,79 @@
package com.autotune.analyzer.adapters;

import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.common.data.system.info.device.accelerator.AcceleratorDeviceData;
import com.google.gson.TypeAdapter;
import com.google.gson.stream.JsonReader;
import com.google.gson.stream.JsonToken;
import com.google.gson.stream.JsonWriter;

import java.io.IOException;

/**
 * Gson {@link TypeAdapter} for the {@link DeviceDetails} hierarchy.
 * Writes a flat JSON object tagged with a "type" discriminator and, on read,
 * reconstructs the concrete subtype from that tag. Currently only
 * {@link AcceleratorDeviceData} is supported; other device types read back as
 * {@code null} until implemented.
 */
public class DeviceDetailsAdapter extends TypeAdapter<DeviceDetails> {

    @Override
    public void write(JsonWriter out, DeviceDetails value) throws IOException {
        // Adapters registered through GsonBuilder#registerTypeAdapter are not
        // null-safe by default; emit a JSON null rather than NPE on value.getType().
        if (value == null) {
            out.nullValue();
            return;
        }
        out.beginObject();
        out.name("type").value(value.getType().name());

        if (value instanceof AcceleratorDeviceData accelerator) {
            out.name("manufacturer").value(accelerator.getManufacturer());
            out.name("modelName").value(accelerator.getModelName());
            out.name("hostName").value(accelerator.getHostName());
            out.name("UUID").value(accelerator.getUUID());
            out.name("deviceName").value(accelerator.getDeviceName());
            out.name("isMIG").value(accelerator.isMIG());
        }
        // Add for other devices when added

        out.endObject();
    }

    @Override
    public DeviceDetails read(JsonReader in) throws IOException {
        // Mirror the null-safety of write(): a JSON null reads back as null.
        if (in.peek() == JsonToken.NULL) {
            in.nextNull();
            return null;
        }

        String type = null;
        String manufacturer = null;
        String modelName = null;
        String hostName = null;
        String UUID = null;
        String deviceName = null;
        boolean isMIG = false;

        in.beginObject();
        while (in.hasNext()) {
            switch (in.nextName()) {
                case "type":
                    type = in.nextString();
                    break;
                case "manufacturer":
                    manufacturer = in.nextString();
                    break;
                case "modelName":
                    modelName = in.nextString();
                    break;
                case "hostName":
                    hostName = in.nextString();
                    break;
                case "UUID":
                    UUID = in.nextString();
                    break;
                case "deviceName":
                    deviceName = in.nextString();
                    break;
                case "isMIG":
                    isMIG = in.nextBoolean();
                    break;
                default:
                    // Tolerate unknown fields for forward compatibility.
                    in.skipValue();
            }
        }
        in.endObject();

        if (type != null && type.equals(AnalyzerConstants.DeviceType.ACCELERATOR.name())) {
            // NOTE(review): "manufacturer" is parsed above but the constructor does not
            // accept it — confirm whether it should be carried through.
            return new AcceleratorDeviceData(modelName, hostName, UUID, deviceName, isMIG);
        }
        // Add for other device types if implemented in future

        return null;
    }
}
+
diff --git a/src/main/java/com/autotune/analyzer/services/DSMetadataService.java b/src/main/java/com/autotune/analyzer/services/DSMetadataService.java
index c3db126ad..f2188714d 100644
--- a/src/main/java/com/autotune/analyzer/services/DSMetadataService.java
+++ b/src/main/java/com/autotune/analyzer/services/DSMetadataService.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.KruizeResponse;
import com.autotune.analyzer.serviceObjects.DSMetadataAPIObject;
@@ -24,6 +25,7 @@
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
import com.autotune.common.data.ValidationOutputData;
import com.autotune.common.data.dataSourceMetadata.DataSourceMetadataInfo;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.common.datasource.DataSourceInfo;
import com.autotune.common.datasource.DataSourceManager;
import com.autotune.common.datasource.DataSourceMetadataValidation;
@@ -419,6 +421,7 @@ private Gson createGsonObject() {
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.create();
}
private boolean isValidBooleanValue(String value) {
diff --git a/src/main/java/com/autotune/analyzer/services/GenerateRecommendations.java b/src/main/java/com/autotune/analyzer/services/GenerateRecommendations.java
index 0b7d183fa..8a2d5f22c 100644
--- a/src/main/java/com/autotune/analyzer/services/GenerateRecommendations.java
+++ b/src/main/java/com/autotune/analyzer/services/GenerateRecommendations.java
@@ -15,6 +15,7 @@
*******************************************************************************/
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.FetchMetricsError;
import com.autotune.analyzer.kruizeObject.KruizeObject;
@@ -30,6 +31,7 @@
import com.autotune.common.data.metrics.MetricResults;
import com.autotune.common.data.result.ContainerData;
import com.autotune.common.data.result.IntervalResults;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.common.datasource.DataSourceInfo;
import com.autotune.common.k8sObjects.K8sObject;
import com.autotune.utils.GenericRestApiClient;
@@ -173,6 +175,7 @@ public boolean shouldSkipClass(Class> clazz) {
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.setExclusionStrategies(strategy)
.create();
gsonStr = gsonObj.toJson(recommendationList);
diff --git a/src/main/java/com/autotune/analyzer/services/ListDatasources.java b/src/main/java/com/autotune/analyzer/services/ListDatasources.java
index 2e870edbb..9493f3ad5 100644
--- a/src/main/java/com/autotune/analyzer/services/ListDatasources.java
+++ b/src/main/java/com/autotune/analyzer/services/ListDatasources.java
@@ -16,11 +16,13 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.serviceObjects.ListDatasourcesAPIObject;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.analyzer.utils.AnalyzerErrorConstants;
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.common.datasource.DataSourceInfo;
import com.autotune.database.service.ExperimentDBService;
import com.autotune.utils.MetricsConfig;
@@ -150,6 +152,7 @@ private Gson createGsonObject() {
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.create();
}
diff --git a/src/main/java/com/autotune/analyzer/services/ListExperiments.java b/src/main/java/com/autotune/analyzer/services/ListExperiments.java
index be23a1596..b8ca71447 100644
--- a/src/main/java/com/autotune/analyzer/services/ListExperiments.java
+++ b/src/main/java/com/autotune/analyzer/services/ListExperiments.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.experiment.KruizeExperiment;
import com.autotune.analyzer.kruizeObject.KruizeObject;
@@ -30,6 +31,7 @@
import com.autotune.common.data.metrics.MetricResults;
import com.autotune.common.data.result.ContainerData;
import com.autotune.common.data.result.IntervalResults;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.common.k8sObjects.K8sObject;
import com.autotune.common.target.kubernetes.service.KubernetesServices;
import com.autotune.common.trials.ExperimentTrial;
@@ -283,6 +285,7 @@ private Gson createGsonObject() {
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.setExclusionStrategies(new ExclusionStrategy() {
@Override
public boolean shouldSkipField(FieldAttributes f) {
diff --git a/src/main/java/com/autotune/analyzer/services/ListRecommendations.java b/src/main/java/com/autotune/analyzer/services/ListRecommendations.java
index c0dcfa942..ee533905f 100644
--- a/src/main/java/com/autotune/analyzer/services/ListRecommendations.java
+++ b/src/main/java/com/autotune/analyzer/services/ListRecommendations.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.KruizeResponse;
import com.autotune.analyzer.kruizeObject.KruizeObject;
@@ -27,6 +28,7 @@
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
import com.autotune.analyzer.utils.ServiceHelpers;
import com.autotune.common.data.result.ContainerData;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.database.service.ExperimentDBService;
import com.autotune.utils.KruizeConstants;
import com.autotune.utils.MetricsConfig;
@@ -226,6 +228,7 @@ public boolean shouldSkipClass(Class> clazz) {
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.setExclusionStrategies(strategy)
.create();
gsonStr = gsonObj.toJson(recommendationList);
diff --git a/src/main/java/com/autotune/analyzer/services/ListSupportedK8sObjects.java b/src/main/java/com/autotune/analyzer/services/ListSupportedK8sObjects.java
index c67a2659d..f0b2db569 100644
--- a/src/main/java/com/autotune/analyzer/services/ListSupportedK8sObjects.java
+++ b/src/main/java/com/autotune/analyzer/services/ListSupportedK8sObjects.java
@@ -15,10 +15,12 @@
*******************************************************************************/
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.serviceObjects.ListSupportedK8sObjectsSO;
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
import com.autotune.analyzer.utils.AnalyzerConstants;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.utils.Utils;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
@@ -59,6 +61,7 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.create();
// Convert the Service object to JSON
responseGSONString = gsonObj.toJson(listSupportedK8sObjectsSO);
diff --git a/src/main/java/com/autotune/analyzer/services/MetricProfileService.java b/src/main/java/com/autotune/analyzer/services/MetricProfileService.java
index 6f92e186e..ca5372c0e 100644
--- a/src/main/java/com/autotune/analyzer/services/MetricProfileService.java
+++ b/src/main/java/com/autotune/analyzer/services/MetricProfileService.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.InvalidValueException;
import com.autotune.analyzer.exceptions.PerformanceProfileResponse;
@@ -29,6 +30,7 @@
import com.autotune.common.data.ValidationOutputData;
import com.autotune.common.data.metrics.Metric;
import com.autotune.common.data.result.ContainerData;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.database.dao.ExperimentDAOImpl;
import com.autotune.database.service.ExperimentDBService;
import com.autotune.utils.KruizeConstants;
@@ -380,6 +382,7 @@ private Gson createGsonObject() {
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
// a custom serializer for serializing metadata of JsonNode type.
.registerTypeAdapter(JsonNode.class, new JsonSerializer() {
@Override
diff --git a/src/main/java/com/autotune/analyzer/services/PerformanceProfileService.java b/src/main/java/com/autotune/analyzer/services/PerformanceProfileService.java
index 3ea3a207f..43cc8588f 100644
--- a/src/main/java/com/autotune/analyzer/services/PerformanceProfileService.java
+++ b/src/main/java/com/autotune/analyzer/services/PerformanceProfileService.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.InvalidValueException;
import com.autotune.analyzer.exceptions.PerformanceProfileResponse;
@@ -27,6 +28,7 @@
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
import com.autotune.common.data.ValidationOutputData;
import com.autotune.common.data.metrics.Metric;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.database.service.ExperimentDBService;
import com.google.gson.ExclusionStrategy;
import com.google.gson.FieldAttributes;
@@ -132,6 +134,7 @@ protected void doGet(HttpServletRequest req, HttpServletResponse response) throw
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.setExclusionStrategies(new ExclusionStrategy() {
@Override
public boolean shouldSkipField(FieldAttributes f) {
diff --git a/src/main/java/com/autotune/analyzer/services/UpdateRecommendations.java b/src/main/java/com/autotune/analyzer/services/UpdateRecommendations.java
index 45218d07b..e558d1d37 100644
--- a/src/main/java/com/autotune/analyzer/services/UpdateRecommendations.java
+++ b/src/main/java/com/autotune/analyzer/services/UpdateRecommendations.java
@@ -15,6 +15,7 @@
*******************************************************************************/
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.FetchMetricsError;
import com.autotune.analyzer.kruizeObject.KruizeObject;
@@ -26,6 +27,7 @@
import com.autotune.analyzer.utils.AnalyzerErrorConstants;
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
import com.autotune.common.data.result.ContainerData;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.operator.KruizeDeploymentInfo;
import com.autotune.utils.KruizeConstants;
import com.autotune.utils.MetricsConfig;
@@ -171,6 +173,7 @@ public boolean shouldSkipClass(Class> clazz) {
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.setExclusionStrategies(strategy)
.create();
gsonStr = gsonObj.toJson(recommendationList);
diff --git a/src/main/java/com/autotune/analyzer/services/UpdateResults.java b/src/main/java/com/autotune/analyzer/services/UpdateResults.java
index 94d929a9a..a5d8bbd79 100644
--- a/src/main/java/com/autotune/analyzer/services/UpdateResults.java
+++ b/src/main/java/com/autotune/analyzer/services/UpdateResults.java
@@ -16,6 +16,7 @@
package com.autotune.analyzer.services;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.KruizeResponse;
import com.autotune.analyzer.experiment.ExperimentInitiator;
@@ -24,6 +25,7 @@
import com.autotune.analyzer.serviceObjects.UpdateResultsAPIObject;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.analyzer.utils.AnalyzerErrorConstants;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.operator.KruizeDeploymentInfo;
import com.autotune.utils.MetricsConfig;
import com.google.gson.*;
@@ -80,6 +82,7 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
.registerTypeAdapter(Double.class, new CustomNumberDeserializer())
.registerTypeAdapter(Integer.class, new CustomNumberDeserializer())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.create();
LOGGER.debug("updateResults API request payload for requestID {} is {}", calCount, inputData);
try {
diff --git a/src/main/java/com/autotune/database/helper/DBHelpers.java b/src/main/java/com/autotune/database/helper/DBHelpers.java
index caab21895..512cf7e3e 100644
--- a/src/main/java/com/autotune/database/helper/DBHelpers.java
+++ b/src/main/java/com/autotune/database/helper/DBHelpers.java
@@ -16,6 +16,7 @@
package com.autotune.database.helper;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.exceptions.InvalidConversionOfRecommendationEntryException;
import com.autotune.analyzer.kruizeObject.KruizeObject;
@@ -33,6 +34,7 @@
import com.autotune.common.data.result.ContainerData;
import com.autotune.common.data.result.ExperimentResultData;
import com.autotune.common.data.result.NamespaceData;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.autotune.common.datasource.DataSourceCollection;
import com.autotune.common.datasource.DataSourceInfo;
import com.autotune.common.datasource.DataSourceMetadataOperator;
@@ -336,6 +338,7 @@ public static KruizeResultsEntry convertExperimentResultToExperimentResultsTable
.setDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT)
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.create();
try {
kruizeResultsEntry = new KruizeResultsEntry();
@@ -476,6 +479,7 @@ public static KruizeRecommendationEntry convertKruizeObjectTORecommendation(Krui
.setDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT)
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.create();
try {
ListRecommendationsAPIObject listRecommendationsAPIObject = getListRecommendationAPIObjectForDB(
@@ -486,6 +490,7 @@ public static KruizeRecommendationEntry convertKruizeObjectTORecommendation(Krui
LOGGER.debug(new GsonBuilder()
.setPrettyPrinting()
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.create()
.toJson(listRecommendationsAPIObject));
kruizeRecommendationEntry = new KruizeRecommendationEntry();
@@ -565,6 +570,7 @@ public static List convertResultEntryToUpdateResultsAPIO
.setDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT)
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.create();
List updateResultsAPIObjects = new ArrayList<>();
for (KruizeResultsEntry kruizeResultsEntry : kruizeResultsEntries) {
@@ -635,6 +641,7 @@ public static List convertRecommendationEntryToRec
.setDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT)
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.create();
List listRecommendationsAPIObjectList = new ArrayList<>();
for (KruizeRecommendationEntry kruizeRecommendationEntry : kruizeRecommendationEntryList) {
From ff1873c1aac7a055b8018cbf1988f30560dbc2bc Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Fri, 4 Oct 2024 14:43:09 +0530
Subject: [PATCH 20/64] Add adapter to utils
Signed-off-by: bharathappali
---
src/main/java/com/autotune/utils/Utils.java | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/main/java/com/autotune/utils/Utils.java b/src/main/java/com/autotune/utils/Utils.java
index 4b94b3270..1b3b281de 100644
--- a/src/main/java/com/autotune/utils/Utils.java
+++ b/src/main/java/com/autotune/utils/Utils.java
@@ -16,10 +16,12 @@
package com.autotune.utils;
+import com.autotune.analyzer.adapters.DeviceDetailsAdapter;
import com.autotune.analyzer.adapters.RecommendationItemAdapter;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.analyzer.utils.GsonUTCDateAdapter;
import com.autotune.common.data.result.ContainerData;
+import com.autotune.common.data.system.info.device.DeviceDetails;
import com.google.gson.ExclusionStrategy;
import com.google.gson.FieldAttributes;
import com.google.gson.Gson;
@@ -171,6 +173,7 @@ public static T getClone(T object, Class classMetadata) {
.enableComplexMapKeySerialization()
.registerTypeAdapter(Date.class, new GsonUTCDateAdapter())
.registerTypeAdapter(AnalyzerConstants.RecommendationItem.class, new RecommendationItemAdapter())
+ .registerTypeAdapter(DeviceDetails.class, new DeviceDetailsAdapter())
.create();
String serialisedString = gson.toJson(object);
From c7d293e20fd5d40e685627e07076032339b8e6b3 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Tue, 8 Oct 2024 14:39:19 +0530
Subject: [PATCH 21/64] add javadoc for DeviceDetailsAdapter
Signed-off-by: bharathappali
---
.../com/autotune/analyzer/adapters/DeviceDetailsAdapter.java | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/main/java/com/autotune/analyzer/adapters/DeviceDetailsAdapter.java b/src/main/java/com/autotune/analyzer/adapters/DeviceDetailsAdapter.java
index cfe5ff150..57ceaf735 100644
--- a/src/main/java/com/autotune/analyzer/adapters/DeviceDetailsAdapter.java
+++ b/src/main/java/com/autotune/analyzer/adapters/DeviceDetailsAdapter.java
@@ -8,6 +8,11 @@
import com.google.gson.stream.JsonWriter;
import java.io.IOException;
+
+/**
+ * This adapter lets GSON identify which concrete implementation of DeviceDetails
+ * to serialize or deserialize.
+ */
public class DeviceDetailsAdapter extends TypeAdapter {
@Override
From 3e894fdb6e52a7a690606237639707d829883acb Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Tue, 8 Oct 2024 14:43:08 +0530
Subject: [PATCH 22/64] Add javadoc for RecommendationItemAdapter
Signed-off-by: bharathappali
---
.../adapters/RecommendationItemAdapter.java | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/src/main/java/com/autotune/analyzer/adapters/RecommendationItemAdapter.java b/src/main/java/com/autotune/analyzer/adapters/RecommendationItemAdapter.java
index 7f7806e3c..79139fbc4 100644
--- a/src/main/java/com/autotune/analyzer/adapters/RecommendationItemAdapter.java
+++ b/src/main/java/com/autotune/analyzer/adapters/RecommendationItemAdapter.java
@@ -6,6 +6,23 @@
import java.lang.reflect.Type;
+/**
+ * Earlier the RecommendationItem enum had only two entries: cpu and memory.
+ * At the time of serialization (storing in the DB or returning as JSON via the API)
+ * Java handled the toString conversion and converted them to the "cpu" and "memory" strings.
+ * They are also keys in the recommendation (requests & limits)
+ *
+ * But in the case of NVIDIA, the resources contain / and . in the string representation of the MIG name,
+ * so we cannot add them as enum entries as-is. Instead we created an entry which accepts a string,
+ * and its toString returns that string value.
+ *
+ * At the time of deserialization the string entries are converted to enum entries, and vice versa in serialization.
+ * For example, if the entry is NVIDIA_GPU_PARTITION_1_CORE_5GB("nvidia.com/mig-1g.5gb") then its toString
+ * will be nvidia.com/mig-1g.5gb, which will not match the enum entry name NVIDIA_GPU_PARTITION_1_CORE_5GB.
+ *
+ * Also, to maintain consistency we renamed the cpu entry to CPU, so without this adapter
+ * the JSON would be generated with "CPU" as the key instead of "cpu".
+ */
public class RecommendationItemAdapter implements JsonSerializer, JsonDeserializer {
@Override
public JsonElement serialize(AnalyzerConstants.RecommendationItem recommendationItem, Type type, JsonSerializationContext jsonSerializationContext) {
From f883e0cc8a20d0b4ae3df9dc2e194354753ef6a4 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Fri, 4 Oct 2024 10:35:28 +0530
Subject: [PATCH 23/64] Add recommendation generation logic
Signed-off-by: bharathappali
---
.../model/PerformanceBasedRecommendationModel.java | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java b/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java
index 247bd93be..0cd9eee41 100644
--- a/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java
+++ b/src/main/java/com/autotune/analyzer/recommendations/model/PerformanceBasedRecommendationModel.java
@@ -400,6 +400,7 @@ public Map getAc
&& RecommendationUtils.checkIfModelIsKruizeSupportedMIG(gpuMetricResult.getAcceleratorDeviceData().getModelName())
) {
String obtainedAcceleratorName = RecommendationUtils.getSupportedModelBasedOnModelName(gpuMetricResult.getAcceleratorDeviceData().getModelName());
+
if (null != obtainedAcceleratorName)
acceleratorModel = obtainedAcceleratorName;
}
From 93a9fad84b6eb5bcd21993dc70d23754a58e2678 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Fri, 4 Oct 2024 14:57:41 +0530
Subject: [PATCH 24/64] Add docs for sample recommendation JSON
Signed-off-by: bharathappali
---
design/MonitoringModeAPI.md | 505 ++++++++++++++++++++++++++++++++++++
1 file changed, 505 insertions(+)
diff --git a/design/MonitoringModeAPI.md b/design/MonitoringModeAPI.md
index 75899125d..91a3d1364 100644
--- a/design/MonitoringModeAPI.md
+++ b/design/MonitoringModeAPI.md
@@ -2960,6 +2960,506 @@ Returns the recommendation at a particular timestamp if it exists
+
+**Response for GPU workloads**
+
+`GET /listRecommendations`
+
+`curl -H 'Accept: application/json' http://:/listRecommendations?experiment_name=job-01`
+
+
+Example Response with GPU Recommendations
+
+```json
+[
+ {
+ "cluster_name": "default",
+ "experiment_type": "container",
+ "kubernetes_objects": [
+ {
+ "type": "statefulset",
+ "name": "human-eval-benchmark",
+ "namespace": "unpartitioned",
+ "containers": [
+ {
+ "container_name": "human-eval-benchmark",
+ "recommendations": {
+ "version": "1.0",
+ "notifications": {
+ "111000": {
+ "type": "info",
+ "message": "Recommendations Are Available",
+ "code": 111000
+ }
+ },
+ "data": {
+ "2024-10-04T09:16:40.000Z": {
+ "notifications": {
+ "111101": {
+ "type": "info",
+ "message": "Short Term Recommendations Available",
+ "code": 111101
+ },
+ "111102": {
+ "type": "info",
+ "message": "Medium Term Recommendations Available",
+ "code": 111102
+ }
+ },
+ "monitoring_end_time": "2024-10-04T09:16:40.000Z",
+ "current": {
+ "limits": {
+ "cpu": {
+ "amount": 2.0,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": 8.589934592E9,
+ "format": "bytes"
+ }
+ },
+ "requests": {
+ "cpu": {
+ "amount": 1.0,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": 8.589934592E9,
+ "format": "bytes"
+ }
+ }
+ },
+ "recommendation_terms": {
+ "short_term": {
+ "duration_in_hours": 24.0,
+ "notifications": {
+ "112101": {
+ "type": "info",
+ "message": "Cost Recommendations Available",
+ "code": 112101
+ },
+ "112102": {
+ "type": "info",
+ "message": "Performance Recommendations Available",
+ "code": 112102
+ }
+ },
+ "monitoring_start_time": "2024-10-03T09:16:40.000Z",
+ "recommendation_engines": {
+ "cost": {
+ "pods_count": 1,
+ "confidence_level": 0.0,
+ "config": {
+ "limits": {
+ "cpu": {
+ "amount": 1.004649523106615,
+ "format": "cores"
+ },
+ "nvidia.com/mig-3g.20gb": {
+ "amount": 1.0,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": 4.9960943616E9,
+ "format": "bytes"
+ }
+ },
+ "requests": {
+ "cpu": {
+ "amount": 1.004649523106615,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": 4.9960943616E9,
+ "format": "bytes"
+ }
+ }
+ },
+ "variation": {
+ "limits": {
+ "cpu": {
+ "amount": -0.995350476893385,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": -3.5938402303999996E9,
+ "format": "bytes"
+ }
+ },
+ "requests": {
+ "cpu": {
+ "amount": 0.004649523106615039,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": -3.5938402303999996E9,
+ "format": "bytes"
+ }
+ }
+ },
+ "notifications": {}
+ },
+ "performance": {
+ "pods_count": 1,
+ "confidence_level": 0.0,
+ "config": {
+ "limits": {
+ "cpu": {
+ "amount": 1.36656145696268,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": 4.9960943616E9,
+ "format": "bytes"
+ },
+ "nvidia.com/mig-4g.20gb": {
+ "amount": 1.0,
+ "format": "cores"
+ }
+ },
+ "requests": {
+ "cpu": {
+ "amount": 1.36656145696268,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": 4.9960943616E9,
+ "format": "bytes"
+ }
+ }
+ },
+ "variation": {
+ "limits": {
+ "cpu": {
+ "amount": -0.63343854303732,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": -3.5938402303999996E9,
+ "format": "bytes"
+ }
+ },
+ "requests": {
+ "cpu": {
+ "amount": 0.36656145696268005,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": -3.5938402303999996E9,
+ "format": "bytes"
+ }
+ }
+ },
+ "notifications": {}
+ }
+ },
+ "plots": {
+ "datapoints": 4,
+ "plots_data": {
+ "2024-10-04T09:16:40.000Z": {
+ "cpuUsage": {
+ "min": 0.005422723351267242,
+ "q1": 1.003281151419465,
+ "median": 1.0118160468783521,
+ "q3": 1.012961901380266,
+ "max": 1.36656145696268,
+ "format": "cores"
+ },
+ "memoryUsage": {
+ "min": 3.68019456E9,
+ "q1": 3.681001472E9,
+ "median": 4.058411008E9,
+ "q3": 4.093308928E9,
+ "max": 4.094062592E9,
+ "format": "bytes"
+ }
+ },
+ "2024-10-04T03:16:40.000Z": {
+ "cpuUsage": {
+ "min": 0.998888009348188,
+ "q1": 1.0029943714818779,
+ "median": 1.0033621837551019,
+ "q3": 1.0040859908301978,
+ "max": 1.0828338199135354,
+ "format": "cores"
+ },
+ "memoryUsage": {
+ "min": 3.679281152E9,
+ "q1": 3.680755712E9,
+ "median": 3.680989184E9,
+ "q3": 3.687673856E9,
+ "max": 4.163411968E9,
+ "format": "bytes"
+ }
+ },
+ "2024-10-03T15:16:40.000Z": {
+ "cpuUsage": {
+ "min": 0.005425605536480822,
+ "q1": 0.006038658069363403,
+ "median": 0.006183237135144752,
+ "q3": 0.006269460195927269,
+ "max": 0.006916437328481231,
+ "format": "cores"
+ },
+ "memoryUsage": {
+ "min": 2.192125952E9,
+ "q1": 2.192388096E9,
+ "median": 2.192388096E9,
+ "q3": 2.192388096E9,
+ "max": 2.19265024E9,
+ "format": "bytes"
+ }
+ },
+ "2024-10-03T21:16:40.000Z": {
+ "cpuUsage": {
+ "min": 0.0052184839046300075,
+ "q1": 0.006229799261227028,
+ "median": 1.0110868114913476,
+ "q3": 1.0124661560983785,
+ "max": 2.3978065580305032,
+ "format": "cores"
+ },
+ "memoryUsage": {
+ "min": 2.118012928E9,
+ "q1": 2.192392192E9,
+ "median": 4.161662976E9,
+ "q3": 4.162850816E9,
+ "max": 4.163170304E9,
+ "format": "bytes"
+ }
+ }
+ }
+ }
+ },
+ "medium_term": {
+ "duration_in_hours": 168.0,
+ "notifications": {
+ "112101": {
+ "type": "info",
+ "message": "Cost Recommendations Available",
+ "code": 112101
+ },
+ "112102": {
+ "type": "info",
+ "message": "Performance Recommendations Available",
+ "code": 112102
+ }
+ },
+ "monitoring_start_time": "2024-09-27T09:16:40.000Z",
+ "recommendation_engines": {
+ "cost": {
+ "pods_count": 1,
+ "confidence_level": 0.0,
+ "config": {
+ "limits": {
+ "cpu": {
+ "amount": 0.015580688959425347,
+ "format": "cores"
+ },
+ "nvidia.com/mig-3g.20gb": {
+ "amount": 1.0,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": 4.9960943616E9,
+ "format": "bytes"
+ }
+ },
+ "requests": {
+ "cpu": {
+ "amount": 0.015580688959425347,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": 4.9960943616E9,
+ "format": "bytes"
+ }
+ }
+ },
+ "variation": {
+ "limits": {
+ "cpu": {
+ "amount": -1.9844193110405746,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": -3.5938402303999996E9,
+ "format": "bytes"
+ }
+ },
+ "requests": {
+ "cpu": {
+ "amount": -0.9844193110405747,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": -3.5938402303999996E9,
+ "format": "bytes"
+ }
+ }
+ },
+ "notifications": {}
+ },
+ "performance": {
+ "pods_count": 1,
+ "confidence_level": 0.0,
+ "config": {
+ "limits": {
+ "cpu": {
+ "amount": 1.025365696933566,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": 4.9960943616E9,
+ "format": "bytes"
+ },
+ "nvidia.com/mig-4g.20gb": {
+ "amount": 1.0,
+ "format": "cores"
+ }
+ },
+ "requests": {
+ "cpu": {
+ "amount": 1.025365696933566,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": 4.9960943616E9,
+ "format": "bytes"
+ }
+ }
+ },
+ "variation": {
+ "limits": {
+ "cpu": {
+ "amount": -0.974634303066434,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": -3.5938402303999996E9,
+ "format": "bytes"
+ }
+ },
+ "requests": {
+ "cpu": {
+ "amount": 0.02536569693356605,
+ "format": "cores"
+ },
+ "memory": {
+ "amount": -3.5938402303999996E9,
+ "format": "bytes"
+ }
+ }
+ },
+ "notifications": {}
+ }
+ },
+ "plots": {
+ "datapoints": 7,
+ "plots_data": {
+ "2024-09-29T09:16:40.000Z": {},
+ "2024-10-04T09:16:40.000Z": {
+ "cpuUsage": {
+ "min": 0.0052184839046300075,
+ "q1": 0.006207971650471658,
+ "median": 1.0032201196711934,
+ "q3": 1.0115567178617741,
+ "max": 2.3978065580305032,
+ "format": "cores"
+ },
+ "memoryUsage": {
+ "min": 2.118012928E9,
+ "q1": 2.192392192E9,
+ "median": 3.6808704E9,
+ "q3": 4.093349888E9,
+ "max": 4.163411968E9,
+ "format": "bytes"
+ }
+ },
+ "2024-09-30T09:16:40.000Z": {},
+ "2024-10-02T09:16:40.000Z": {
+ "cpuUsage": {
+ "min": 0.00554280490421283,
+ "q1": 0.015358846193868379,
+ "median": 0.015705212168337323,
+ "q3": 1.010702281083678,
+ "max": 1.0139464901392594,
+ "format": "cores"
+ },
+ "memoryUsage": {
+ "min": 2.192125952E9,
+ "q1": 2.717663232E9,
+ "median": 2.719612928E9,
+ "q3": 2.719617024E9,
+ "max": 2.720600064E9,
+ "format": "bytes"
+ }
+ },
+ "2024-09-28T09:16:40.000Z": {},
+ "2024-10-03T09:16:40.000Z": {
+ "cpuUsage": {
+ "min": 0.005373319820852367,
+ "q1": 0.006054991034195089,
+ "median": 0.006142447129874265,
+ "q3": 0.006268777122325054,
+ "max": 0.007366566784856696,
+ "format": "cores"
+ },
+ "memoryUsage": {
+ "min": 2.192125952E9,
+ "q1": 2.192388096E9,
+ "median": 2.192388096E9,
+ "q3": 2.192388096E9,
+ "max": 2.192654336E9,
+ "format": "bytes"
+ }
+ },
+ "2024-10-01T09:16:40.000Z": {
+ "cpuUsage": {
+ "min": 0.003319077875529473,
+ "q1": 1.0101034685479167,
+ "median": 1.0118171810142638,
+ "q3": 1.0208974318073034,
+ "max": 3.5577616386258963,
+ "format": "cores"
+ },
+ "memoryUsage": {
+ "min": 1.77057792E8,
+ "q1": 2.64523776E9,
+ "median": 2.651078656E9,
+ "q3": 2.693431296E9,
+ "max": 2.705133568E9,
+ "format": "bytes"
+ }
+ }
+ }
+ }
+ },
+ "long_term": {
+ "duration_in_hours": 360.0,
+ "notifications": {
+ "120001": {
+ "type": "info",
+ "message": "There is not enough data available to generate a recommendation.",
+ "code": 120001
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ ]
+ }
+ ],
+ "version": "v2.0",
+ "experiment_name": "human_eval_exp"
+ }
+]
+```
+
+
### Invalid Scenarios:
@@ -5049,6 +5549,11 @@ structured and easily interpretable way for users or external systems to access
+
+
+
+
+
---
From 1f35fe8f73e30f6d8088ed875cf5ac4bb0f6ca73 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Fri, 4 Oct 2024 13:58:56 +0530
Subject: [PATCH 25/64] Request Payload and Response Object template
Signed-off-by: msvinaykumar
---
.../analyzer/serviceObjects/BulkInput.java | 121 ++++++++
.../serviceObjects/BulkJobStatus.java | 265 ++++++++++++++++++
2 files changed, 386 insertions(+)
create mode 100644 src/main/java/com/autotune/analyzer/serviceObjects/BulkInput.java
create mode 100644 src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java
diff --git a/src/main/java/com/autotune/analyzer/serviceObjects/BulkInput.java b/src/main/java/com/autotune/analyzer/serviceObjects/BulkInput.java
new file mode 100644
index 000000000..185b19679
--- /dev/null
+++ b/src/main/java/com/autotune/analyzer/serviceObjects/BulkInput.java
@@ -0,0 +1,121 @@
+package com.autotune.analyzer.serviceObjects;
+
+import java.util.List;
+import java.util.Map;
+
+public class BulkInput {
+ private FilterWrapper filter;
+ private TimeRange time_range;
+ private String datasource;
+
+ // Getters and Setters
+
+ public TimeRange getTime_range() {
+ return time_range;
+ }
+
+ public void setTime_range(TimeRange time_range) {
+ this.time_range = time_range;
+ }
+
+ public String getDatasource() {
+ return datasource;
+ }
+
+ public void setDatasource(String datasource) {
+ this.datasource = datasource;
+ }
+
+ public FilterWrapper getFilter() {
+ return filter;
+ }
+
+ public void setFilter(FilterWrapper filter) {
+ this.filter = filter;
+ }
+
+ // Nested wrapper holding the 'exclude' and 'include' filter criteria
+ public static class FilterWrapper {
+ private Filter exclude;
+ private Filter include;
+
+ // Getters and Setters
+ public Filter getExclude() {
+ return exclude;
+ }
+
+ public void setExclude(Filter exclude) {
+ this.exclude = exclude;
+ }
+
+ public Filter getInclude() {
+ return include;
+ }
+
+ public void setInclude(Filter include) {
+ this.include = include;
+ }
+ }
+
+ public static class Filter {
+ private List<String> namespace;
+ private List<String> workload;
+ private List<String> containers;
+ private Map<String, String> labels;
+
+ // Getters and Setters
+ public List<String> getNamespace() {
+ return namespace;
+ }
+
+ public void setNamespace(List<String> namespace) {
+ this.namespace = namespace;
+ }
+
+ public List<String> getWorkload() {
+ return workload;
+ }
+
+ public void setWorkload(List<String> workload) {
+ this.workload = workload;
+ }
+
+ public List<String> getContainers() {
+ return containers;
+ }
+
+ public void setContainers(List<String> containers) {
+ this.containers = containers;
+ }
+
+ public Map<String, String> getLabels() {
+ return labels;
+ }
+
+ public void setLabels(Map<String, String> labels) {
+ this.labels = labels;
+ }
+ }
+
+ public static class TimeRange {
+ private String start;
+ private String end;
+
+ // Getters and Setters
+ public String getStart() {
+ return start;
+ }
+
+ public void setStart(String start) {
+ this.start = start;
+ }
+
+ public String getEnd() {
+ return end;
+ }
+
+ public void setEnd(String end) {
+ this.end = end;
+ }
+ }
+}
diff --git a/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java b/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java
new file mode 100644
index 000000000..fe4b313a7
--- /dev/null
+++ b/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java
@@ -0,0 +1,265 @@
+package com.autotune.analyzer.serviceObjects;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import java.time.Instant;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
+import java.util.List;
+
+public class BulkJobStatus {
+ private String jobID;
+ private String status;
+ private int progress;
+ private Data data;
+ @JsonProperty("start_time")
+ private String startTime; // Stored as a formatted UTC string for JSON output
+ @JsonProperty("end_time")
+ private String endTime; // Stored as a formatted UTC string for JSON output
+
+ public BulkJobStatus(String jobID, String status, int progress, Data data, Instant startTime) {
+ this.jobID = jobID;
+ this.status = status;
+ this.progress = progress;
+ this.data = data;
+ setStartTime(startTime);
+ }
+
+ public String getJobID() {
+ return jobID;
+ }
+
+ public String getStatus() {
+ return status;
+ }
+
+ public void setStatus(String status) {
+ this.status = status;
+ }
+
+ public int getProgress() {
+ return progress;
+ }
+
+ public void setProgress(int progress) {
+ this.progress = progress;
+ }
+
+ public Data getData() {
+ return data;
+ }
+
+ public void setData(Data data) {
+ this.data = data;
+ }
+
+ public String getStartTime() {
+ return startTime;
+ }
+
+ public void setStartTime(Instant startTime) {
+ this.startTime = formatInstantAsUTCString(startTime);
+ }
+
+ public String getEndTime() {
+ return endTime;
+ }
+
+ public void setEndTime(Instant endTime) {
+ this.endTime = formatInstantAsUTCString(endTime);
+ }
+
+ // Utility function to format Instant into the required UTC format
+ private String formatInstantAsUTCString(Instant instant) {
+ DateTimeFormatter formatter = DateTimeFormatter
+ .ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
+ .withZone(ZoneOffset.UTC); // Ensure it's in UTC
+
+ return formatter.format(instant);
+ }
+
+ // Inner class for the data field
+ public static class Data {
+ private Experiments experiments;
+ private Recommendations recommendations;
+
+ public Data(Experiments experiments, Recommendations recommendations) {
+ this.experiments = experiments;
+ this.recommendations = recommendations;
+ }
+
+ public Experiments getExperiments() {
+ return experiments;
+ }
+
+ public void setExperiments(Experiments experiments) {
+ this.experiments = experiments;
+ }
+
+ public Recommendations getRecommendations() {
+ return recommendations;
+ }
+
+ public void setRecommendations(Recommendations recommendations) {
+ this.recommendations = recommendations;
+ }
+ }
+
+ // Inner class for experiments
+ public static class Experiments {
+ @JsonProperty("new")
+ private List<String> newExperiments;
+ @JsonProperty("updated")
+ private List<String> updatedExperiments;
+ @JsonProperty("failed")
+ private List<String> failedExperiments;
+
+ public Experiments(List<String> newExperiments, List<String> updatedExperiments) {
+ this.newExperiments = newExperiments;
+ this.updatedExperiments = updatedExperiments;
+ }
+
+ public List<String> getNewExperiments() {
+ return newExperiments;
+ }
+
+ public void setNewExperiments(List<String> newExperiments) {
+ this.newExperiments = newExperiments;
+ }
+
+ public List<String> getUpdatedExperiments() {
+ return updatedExperiments;
+ }
+
+ public void setUpdatedExperiments(List<String> updatedExperiments) {
+ this.updatedExperiments = updatedExperiments;
+ }
+ }
+
+ // Inner class for recommendations
+ public static class Recommendations {
+ @JsonProperty("count")
+ private int totalCount;
+ @JsonProperty("completed")
+ private int completedCount;
+ private RecommendationData data;
+
+ public Recommendations(int totalCount, int completedCount, RecommendationData data) {
+ this.totalCount = totalCount;
+ this.completedCount = completedCount;
+ this.data = data;
+ }
+
+ public int getTotalCount() {
+ return totalCount;
+ }
+
+ public void setTotalCount(int totalCount) {
+ this.totalCount = totalCount;
+ }
+
+ public int getCompletedCount() {
+ return (data == null || data.getCompleted() == null) ? completedCount : data.getCompleted().size();
+ }
+
+ public void setCompletedCount(int completedCount) {
+ this.completedCount = completedCount;
+ }
+
+ public RecommendationData getData() {
+ return data;
+ }
+
+ public void setData(RecommendationData data) {
+ this.data = data;
+ }
+ }
+
+ // Inner class for recommendation data
+ public static class RecommendationData {
+ private List<String> completed;
+ private List<String> progress;
+ private List<String> inqueue;
+ private List<String> failed;
+
+ public RecommendationData(List<String> completed, List<String> progress, List<String> inqueue, List<String> failed) {
+ this.completed = completed;
+ this.progress = progress;
+ this.inqueue = inqueue;
+ this.failed = failed;
+ }
+
+ public List<String> getCompleted() {
+ return completed;
+ }
+
+ public void setCompleted(List<String> completed) {
+ this.completed = completed;
+ }
+
+ public List<String> getProgress() {
+ return progress;
+ }
+
+ public void setProgress(List<String> progress) {
+ this.progress = progress;
+ }
+
+ public List<String> getInqueue() {
+ return inqueue;
+ }
+
+ public void setInqueue(List<String> inqueue) {
+ this.inqueue = inqueue;
+ }
+
+ public List<String> getFailed() {
+ return failed;
+ }
+
+ public void setFailed(List<String> failed) {
+ this.failed = failed;
+ }
+
+ // Move elements from inqueue to progress
+ public void moveToProgress(String element) {
+ if (inqueue.contains(element)) {
+ inqueue.remove(element);
+ if (!progress.contains(element)) {
+ progress.add(element);
+ }
+ }
+ }
+
+ // Move elements from progress to completed
+ public void moveToCompleted(String element) {
+ if (progress.contains(element)) {
+ progress.remove(element);
+ if (!completed.contains(element)) {
+ completed.add(element);
+ }
+ }
+ }
+
+ // Move elements from progress to failed
+ public void moveToFailed(String element) {
+ if (progress.contains(element)) {
+ progress.remove(element);
+ if (!failed.contains(element)) {
+ failed.add(element);
+ }
+ }
+ }
+
+ // Calculate the percentage of completion
+ public int completionPercentage() {
+ int totalTasks = completed.size() + progress.size() + inqueue.size() + failed.size();
+ if (totalTasks == 0) {
+ return 0;
+ }
+ return (int) ((completed.size() * 100.0) / totalTasks);
+ }
+ }
+
+
+}
From 564baa082f8853dd77e699af995709fdfd4ad54e Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Fri, 4 Oct 2024 16:39:41 +0530
Subject: [PATCH 26/64] 3. Bulk Service to handle Request and Response
Signed-off-by: msvinaykumar
---
.../analyzer/services/BulkService.java | 126 ++++++++++++++++++
1 file changed, 126 insertions(+)
create mode 100644 src/main/java/com/autotune/analyzer/services/BulkService.java
diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java
new file mode 100644
index 000000000..35266b20a
--- /dev/null
+++ b/src/main/java/com/autotune/analyzer/services/BulkService.java
@@ -0,0 +1,126 @@
+/*******************************************************************************
+ * Copyright (c) 2022 Red Hat, IBM Corporation and others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+package com.autotune.analyzer.services;
+
+import com.autotune.analyzer.serviceObjects.BulkInput;
+import com.autotune.analyzer.serviceObjects.BulkJobStatus;
+import com.autotune.analyzer.workerimpl.BulkJobManager;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletException;
+import javax.servlet.annotation.WebServlet;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import java.io.IOException;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+/**
+ *
+ */
+@WebServlet(asyncSupported = true)
+public class BulkService extends HttpServlet {
+ private static final long serialVersionUID = 1L;
+ private static final Logger LOGGER = LoggerFactory.getLogger(BulkService.class);
+ private ExecutorService executorService = Executors.newFixedThreadPool(10);
+ private Map jobStatusMap = new ConcurrentHashMap<>();
+
+ @Override
+ public void init(ServletConfig config) throws ServletException {
+ super.init(config);
+ }
+
+ /**
+ * @param req
+ * @param resp
+ * @throws ServletException
+ * @throws IOException
+ */
+ @Override
+ protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
+ String jobID = req.getParameter("jobID");
+ BulkJobStatus jobStatus = jobStatusMap.get(jobID);
+
+ if (jobStatus == null) {
+ resp.setStatus(HttpServletResponse.SC_NOT_FOUND);
+ resp.getWriter().write("{\"error\":\"Job not found\"}");
+ } else {
+ try {
+ resp.setContentType("application/json");
+ resp.setCharacterEncoding("UTF-8");
+ // Return the JSON representation of the JobStatus object
+ ObjectMapper objectMapper = new ObjectMapper();
+ String jsonResponse = objectMapper.writeValueAsString(jobStatus);
+ resp.getWriter().write(jsonResponse);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ /**
+ * @param request
+ * @param response
+ * @throws ServletException
+ * @throws IOException
+ */
+ @Override
+ protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+ // Set response type
+ response.setContentType("application/json");
+
+ // Create ObjectMapper instance
+ ObjectMapper objectMapper = new ObjectMapper();
+
+ // Read the request payload and map to RequestPayload class
+ BulkInput payload = objectMapper.readValue(request.getInputStream(), BulkInput.class);
+
+ // Generate a unique jobID
+ String jobID = UUID.randomUUID().toString();
+ BulkJobStatus.Data data = new BulkJobStatus.Data(
+ new BulkJobStatus.Experiments(new ArrayList<>(), new ArrayList<>()),
+ new BulkJobStatus.Recommendations(0, 0, new BulkJobStatus.RecommendationData(
+ new ArrayList<>(),
+ new ArrayList<>(),
+ new ArrayList<>(),
+ new ArrayList<>()
+ ))
+ );
+ jobStatusMap.put(jobID, new BulkJobStatus(jobID, "IN_PROGRESS", 0, data, Instant.now()));
+ // Submit the job to be processed asynchronously
+ // TODO example : executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
+
+ // Just sending a simple success response back
+ // Return the jobID to the user
+ response.setContentType("application/json");
+ response.getWriter().write("{\"jobID\":\"" + jobID + "\"}");
+ }
+
+
+ @Override
+ public void destroy() {
+ executorService.shutdown();
+ }
+}
From c999e278cf2a143a8f86a2067f200feb58401a70 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Mon, 7 Oct 2024 12:51:35 +0530
Subject: [PATCH 27/64] renamed jobID to job_id
Signed-off-by: msvinaykumar
---
.../java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java b/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java
index fe4b313a7..c04684321 100644
--- a/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java
+++ b/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java
@@ -8,6 +8,7 @@
import java.util.List;
public class BulkJobStatus {
+ @JsonProperty("job_id")
private String jobID;
private String status;
private int progress;
From 10b06cfd566ad2e4eeac23a288f5b79f4caf0f3a Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Fri, 4 Oct 2024 16:52:21 +0530
Subject: [PATCH 28/64] 3. Bulk Service to handle Request and Response
Signed-off-by: msvinaykumar
---
src/main/java/com/autotune/analyzer/Analyzer.java | 1 +
src/main/java/com/autotune/analyzer/services/BulkService.java | 3 +--
src/main/java/com/autotune/utils/ServerContext.java | 3 +++
3 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/src/main/java/com/autotune/analyzer/Analyzer.java b/src/main/java/com/autotune/analyzer/Analyzer.java
index 9ebf49199..0c2cea55b 100644
--- a/src/main/java/com/autotune/analyzer/Analyzer.java
+++ b/src/main/java/com/autotune/analyzer/Analyzer.java
@@ -58,6 +58,7 @@ public static void addServlets(ServletContextHandler context) {
context.addServlet(MetricProfileService.class, ServerContext.DELETE_METRIC_PROFILE);
context.addServlet(ListDatasources.class, ServerContext.LIST_DATASOURCES);
context.addServlet(DSMetadataService.class, ServerContext.DATASOURCE_METADATA);
+ context.addServlet(BulkService.class, ServerContext.BULK_SERVICE);
// Adding UI support API's
context.addServlet(ListNamespaces.class, ServerContext.LIST_NAMESPACES);
diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java
index 35266b20a..104dfb5d3 100644
--- a/src/main/java/com/autotune/analyzer/services/BulkService.java
+++ b/src/main/java/com/autotune/analyzer/services/BulkService.java
@@ -17,7 +17,6 @@
import com.autotune.analyzer.serviceObjects.BulkInput;
import com.autotune.analyzer.serviceObjects.BulkJobStatus;
-import com.autotune.analyzer.workerimpl.BulkJobManager;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -110,7 +109,7 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
);
jobStatusMap.put(jobID, new BulkJobStatus(jobID, "IN_PROGRESS", 0, data, Instant.now()));
// Submit the job to be processed asynchronously
- // TODO example : executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
+ // example executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
// Just sending a simple success response back
// Return the jobID to the user
diff --git a/src/main/java/com/autotune/utils/ServerContext.java b/src/main/java/com/autotune/utils/ServerContext.java
index 2c95d3efe..eac7f6079 100644
--- a/src/main/java/com/autotune/utils/ServerContext.java
+++ b/src/main/java/com/autotune/utils/ServerContext.java
@@ -75,4 +75,7 @@ public class ServerContext {
public static final String LIST_NAMESPACES = QUERY_CONTEXT + "listNamespaces";
public static final String LIST_DEPLOYMENTS = QUERY_CONTEXT + "listDeployments";
public static final String LIST_K8S_OBJECTS = QUERY_CONTEXT + "listK8sObjects";
+
+ //Bulk Service
+ public static final String BULK_SERVICE = ROOT_CONTEXT + "bulk";
}
From cb26e4695361bc396f0c5a12491fa01f65cd004e Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Mon, 7 Oct 2024 12:56:20 +0530
Subject: [PATCH 29/64] renamed jobID to job_id
Signed-off-by: msvinaykumar
---
src/main/java/com/autotune/analyzer/services/BulkService.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java
index 104dfb5d3..5841f03ec 100644
--- a/src/main/java/com/autotune/analyzer/services/BulkService.java
+++ b/src/main/java/com/autotune/analyzer/services/BulkService.java
@@ -59,7 +59,7 @@ public void init(ServletConfig config) throws ServletException {
*/
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
- String jobID = req.getParameter("jobID");
+ String jobID = req.getParameter("job_id");
BulkJobStatus jobStatus = jobStatusMap.get(jobID);
if (jobStatus == null) {
@@ -114,7 +114,7 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
// Just sending a simple success response back
// Return the jobID to the user
response.setContentType("application/json");
- response.getWriter().write("{\"jobID\":\"" + jobID + "\"}");
+ response.getWriter().write("{\"job_id\":\"" + jobID + "\"}");
}
From 920a9cbf483a95e21d14d94fe0974d754b175604 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Mon, 7 Oct 2024 20:11:22 +0530
Subject: [PATCH 30/64] incorporated review comments
Signed-off-by: msvinaykumar
---
.../analyzer/serviceObjects/BulkInput.java | 18 +++++++++++
.../serviceObjects/BulkJobStatus.java | 22 +++++++++++++-
.../analyzer/services/BulkService.java | 30 +++++++++++++------
.../com/autotune/utils/KruizeConstants.java | 15 ++++++++++
4 files changed, 75 insertions(+), 10 deletions(-)
diff --git a/src/main/java/com/autotune/analyzer/serviceObjects/BulkInput.java b/src/main/java/com/autotune/analyzer/serviceObjects/BulkInput.java
index 185b19679..e5e31d40d 100644
--- a/src/main/java/com/autotune/analyzer/serviceObjects/BulkInput.java
+++ b/src/main/java/com/autotune/analyzer/serviceObjects/BulkInput.java
@@ -1,8 +1,26 @@
+/*******************************************************************************
+ * Copyright (c) 2022 Red Hat, IBM Corporation and others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
package com.autotune.analyzer.serviceObjects;
import java.util.List;
import java.util.Map;
+/**
+ * Request payload object for Bulk Api service
+ */
public class BulkInput {
private FilterWrapper filter;
private TimeRange time_range;
diff --git a/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java b/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java
index c04684321..17b0d787c 100644
--- a/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java
+++ b/src/main/java/com/autotune/analyzer/serviceObjects/BulkJobStatus.java
@@ -1,3 +1,18 @@
+/*******************************************************************************
+ * Copyright (c) 2022 Red Hat, IBM Corporation and others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
package com.autotune.analyzer.serviceObjects;
import com.fasterxml.jackson.annotation.JsonProperty;
@@ -7,8 +22,13 @@
import java.time.format.DateTimeFormatter;
import java.util.List;
+import static com.autotune.utils.KruizeConstants.KRUIZE_BULK_API.JOB_ID;
+
+/**
+ * Bulk API Response payload Object.
+ */
public class BulkJobStatus {
- @JsonProperty("job_id")
+ @JsonProperty(JOB_ID)
private String jobID;
private String status;
private int progress;
diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java
index 5841f03ec..2b69a4af0 100644
--- a/src/main/java/com/autotune/analyzer/services/BulkService.java
+++ b/src/main/java/com/autotune/analyzer/services/BulkService.java
@@ -18,6 +18,7 @@
import com.autotune.analyzer.serviceObjects.BulkInput;
import com.autotune.analyzer.serviceObjects.BulkJobStatus;
import com.fasterxml.jackson.databind.ObjectMapper;
+import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -36,6 +37,10 @@
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
+import static com.autotune.analyzer.utils.AnalyzerConstants.ServiceConstants.CHARACTER_ENCODING;
+import static com.autotune.analyzer.utils.AnalyzerConstants.ServiceConstants.JSON_CONTENT_TYPE;
+import static com.autotune.utils.KruizeConstants.KRUIZE_BULK_API.*;
+
/**
*
*/
@@ -59,16 +64,19 @@ public void init(ServletConfig config) throws ServletException {
*/
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
- String jobID = req.getParameter("job_id");
+ String jobID = req.getParameter(JOB_ID);
BulkJobStatus jobStatus = jobStatusMap.get(jobID);
+ resp.setContentType(JSON_CONTENT_TYPE);
+ resp.setCharacterEncoding(CHARACTER_ENCODING);
if (jobStatus == null) {
resp.setStatus(HttpServletResponse.SC_NOT_FOUND);
- resp.getWriter().write("{\"error\":\"Job not found\"}");
+ JSONObject jsonObject = new JSONObject();
+ jsonObject.put(ERROR, JOB_NOT_FOUND_MSG);
+ resp.getWriter().write(jsonObject.toString());
} else {
try {
- resp.setContentType("application/json");
- resp.setCharacterEncoding("UTF-8");
+ resp.setStatus(HttpServletResponse.SC_OK);
// Return the JSON representation of the JobStatus object
ObjectMapper objectMapper = new ObjectMapper();
String jsonResponse = objectMapper.writeValueAsString(jobStatus);
@@ -88,7 +96,8 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws Se
@Override
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
// Set response type
- response.setContentType("application/json");
+ response.setContentType(JSON_CONTENT_TYPE);
+ response.setCharacterEncoding(CHARACTER_ENCODING);
// Create ObjectMapper instance
ObjectMapper objectMapper = new ObjectMapper();
@@ -107,14 +116,15 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
new ArrayList<>()
))
);
- jobStatusMap.put(jobID, new BulkJobStatus(jobID, "IN_PROGRESS", 0, data, Instant.now()));
+ jobStatusMap.put(jobID, new BulkJobStatus(jobID, IN_PROGRESS, 0, data, Instant.now()));
// Submit the job to be processed asynchronously
- // example executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
+ executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
// Just sending a simple success response back
// Return the jobID to the user
- response.setContentType("application/json");
- response.getWriter().write("{\"job_id\":\"" + jobID + "\"}");
+ JSONObject jsonObject = new JSONObject();
+ jsonObject.put(JOB_ID, jobID);
+ response.getWriter().write(jsonObject.toString());
}
@@ -122,4 +132,6 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
public void destroy() {
executorService.shutdown();
}
+
+
}
diff --git a/src/main/java/com/autotune/utils/KruizeConstants.java b/src/main/java/com/autotune/utils/KruizeConstants.java
index 62dd08247..d44ffdd16 100644
--- a/src/main/java/com/autotune/utils/KruizeConstants.java
+++ b/src/main/java/com/autotune/utils/KruizeConstants.java
@@ -412,6 +412,7 @@ private DataSourceConstants() {
public static class DataSourceDetailsInfoConstants {
public static final String version = "v1.0";
public static final String CLUSTER_NAME = "default";
+
private DataSourceDetailsInfoConstants() {
}
}
@@ -453,6 +454,7 @@ public static class DataSourceErrorMsgs {
public static final String ENDPOINT_NOT_FOUND = "Service endpoint not found.";
public static final String MISSING_DATASOURCE_INFO = "Datasource is missing, add a valid Datasource";
public static final String INVALID_DATASOURCE_INFO = "Datasource is either missing or is invalid";
+
private DataSourceErrorMsgs() {
}
}
@@ -464,6 +466,7 @@ public static class DataSourceQueryJSONKeys {
public static final String METRIC = "metric";
public static final String VALUE = "value";
public static final String VALUES = "values";
+
private DataSourceQueryJSONKeys() {
}
@@ -472,6 +475,7 @@ private DataSourceQueryJSONKeys() {
public static class DataSourceQueryStatus {
public static final String SUCCESS = "success";
public static final String ERROR = "error";
+
private DataSourceQueryStatus() {
}
}
@@ -482,6 +486,7 @@ public static class DataSourceQueryMetricKeys {
public static final String WORKLOAD_TYPE = "workload_type";
public static final String CONTAINER_NAME = "container";
public static final String CONTAINER_IMAGE_NAME = "image";
+
private DataSourceQueryMetricKeys() {
}
}
@@ -489,6 +494,7 @@ private DataSourceQueryMetricKeys() {
public static class DataSourceMetadataInfoConstants {
public static final String version = "v1.0";
public static final String CLUSTER_NAME = "default";
+
private DataSourceMetadataInfoConstants() {
}
}
@@ -525,6 +531,7 @@ public static class DataSourceMetadataErrorMsgs {
public static final String DATASOURCE_METADATA_VALIDATION_FAILURE_MSG = "Validation of imported metadata failed, mandatory fields missing: %s";
public static final String NAMESPACE_QUERY_VALIDATION_FAILED = "Validation failed for namespace data query.";
public static final String DATASOURCE_OPERATOR_RETRIEVAL_FAILURE = "Failed to retrieve data source operator for provider: %s";
+
private DataSourceMetadataErrorMsgs() {
}
}
@@ -542,6 +549,7 @@ public static class DataSourceMetadataInfoJSONKeys {
public static final String CONTAINERS = "containers";
public static final String CONTAINER_NAME = "container_name";
public static final String CONTAINER_IMAGE_NAME = "container_image_name";
+
private DataSourceMetadataInfoJSONKeys() {
}
}
@@ -753,4 +761,11 @@ public static final class AuthenticationConstants {
public static final String AUTHORIZATION = "Authorization";
}
+
+ public static final class KRUIZE_BULK_API {
+ public static final String JOB_ID = "job_id";
+ public static final String ERROR = "error";
+ public static final String JOB_NOT_FOUND_MSG = "Job not found";
+ public static final String IN_PROGRESS = "IN_PROGRESS";
+ }
}
From ffbc8f894fc3a3cd7a28c986f24388b40bd99f06 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Tue, 8 Oct 2024 15:28:12 +0530
Subject: [PATCH 31/64] resolved pr check
Signed-off-by: msvinaykumar
---
src/main/java/com/autotune/analyzer/services/BulkService.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java
index 2b69a4af0..c413a0080 100644
--- a/src/main/java/com/autotune/analyzer/services/BulkService.java
+++ b/src/main/java/com/autotune/analyzer/services/BulkService.java
@@ -118,7 +118,7 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
);
jobStatusMap.put(jobID, new BulkJobStatus(jobID, IN_PROGRESS, 0, data, Instant.now()));
// Submit the job to be processed asynchronously
- executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
+ // executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
// Just sending a simple success response back
// Return the jobID to the user
From 15cf9c3f2cac6eec28cfacad65e77996bd9146bf Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Mon, 7 Oct 2024 20:18:31 +0530
Subject: [PATCH 32/64] incorporated review comments
Signed-off-by: msvinaykumar
---
src/main/java/com/autotune/analyzer/services/BulkService.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java
index c413a0080..2b69a4af0 100644
--- a/src/main/java/com/autotune/analyzer/services/BulkService.java
+++ b/src/main/java/com/autotune/analyzer/services/BulkService.java
@@ -118,7 +118,7 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
);
jobStatusMap.put(jobID, new BulkJobStatus(jobID, IN_PROGRESS, 0, data, Instant.now()));
// Submit the job to be processed asynchronously
- // executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
+ executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
// Just sending a simple success response back
// Return the jobID to the user
From ee8b7ddb1d3c5546fff6274c90ee09ce86cf84e8 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Mon, 7 Oct 2024 20:47:44 +0530
Subject: [PATCH 33/64] incorporated review comments
Signed-off-by: msvinaykumar
---
.../analyzer/services/BulkService.java | 20 ++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java
index 2b69a4af0..888d0145e 100644
--- a/src/main/java/com/autotune/analyzer/services/BulkService.java
+++ b/src/main/java/com/autotune/analyzer/services/BulkService.java
@@ -70,10 +70,12 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws Se
resp.setCharacterEncoding(CHARACTER_ENCODING);
if (jobStatus == null) {
- resp.setStatus(HttpServletResponse.SC_NOT_FOUND);
- JSONObject jsonObject = new JSONObject();
- jsonObject.put(ERROR, JOB_NOT_FOUND_MSG);
- resp.getWriter().write(jsonObject.toString());
+ sendErrorResponse(
+ resp,
+ null,
+ HttpServletResponse.SC_NOT_FOUND,
+ JOB_NOT_FOUND_MSG
+ );
} else {
try {
resp.setStatus(HttpServletResponse.SC_OK);
@@ -133,5 +135,13 @@ public void destroy() {
executorService.shutdown();
}
-
+ public void sendErrorResponse(HttpServletResponse response, Exception e, int httpStatusCode, String errorMsg) throws
+ IOException {
+ if (null != e) {
+ LOGGER.error(e.toString());
+ e.printStackTrace();
+ if (null == errorMsg) errorMsg = e.getMessage();
+ }
+ response.sendError(httpStatusCode, errorMsg);
+ }
}
From 0c43ad3714733fc68eaa389397a013f4c0e95b14 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Tue, 8 Oct 2024 15:32:39 +0530
Subject: [PATCH 34/64] resolved pr check
Signed-off-by: msvinaykumar
---
src/main/java/com/autotune/analyzer/services/BulkService.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java
index 888d0145e..2591d70f4 100644
--- a/src/main/java/com/autotune/analyzer/services/BulkService.java
+++ b/src/main/java/com/autotune/analyzer/services/BulkService.java
@@ -120,7 +120,7 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
);
jobStatusMap.put(jobID, new BulkJobStatus(jobID, IN_PROGRESS, 0, data, Instant.now()));
// Submit the job to be processed asynchronously
- executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
+ // executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
// Just sending a simple success response back
// Return the jobID to the user
From 53f47326927371cd08aced3013fdcd583fad2144 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Mon, 7 Oct 2024 20:18:31 +0530
Subject: [PATCH 35/64] incorporated review comments
Signed-off-by: msvinaykumar
---
src/main/java/com/autotune/analyzer/services/BulkService.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java
index 2591d70f4..888d0145e 100644
--- a/src/main/java/com/autotune/analyzer/services/BulkService.java
+++ b/src/main/java/com/autotune/analyzer/services/BulkService.java
@@ -120,7 +120,7 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
);
jobStatusMap.put(jobID, new BulkJobStatus(jobID, IN_PROGRESS, 0, data, Instant.now()));
// Submit the job to be processed asynchronously
- // executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
+ executorService.submit(new BulkJobManager(jobID, jobStatusMap, payload));
// Just sending a simple success response back
// Return the jobID to the user
From a4bec2ee7f36cb677c992939598dba36d3fe58da Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Fri, 4 Oct 2024 16:52:21 +0530
Subject: [PATCH 36/64] 3. Bulk Service to handle Request and Response
Signed-off-by: msvinaykumar
---
src/main/java/com/autotune/analyzer/services/BulkService.java | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java
index 888d0145e..40325bcce 100644
--- a/src/main/java/com/autotune/analyzer/services/BulkService.java
+++ b/src/main/java/com/autotune/analyzer/services/BulkService.java
@@ -17,6 +17,7 @@
import com.autotune.analyzer.serviceObjects.BulkInput;
import com.autotune.analyzer.serviceObjects.BulkJobStatus;
+import com.autotune.analyzer.workerimpl.BulkJobManager;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.json.JSONObject;
import org.slf4j.Logger;
From fd4b1b55c9b96da209f39a4d999f3c932dd86c88 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Fri, 4 Oct 2024 17:09:35 +0530
Subject: [PATCH 37/64] 4. Bulk Job manager
Signed-off-by: msvinaykumar
---
.../serviceObjects/KubernetesAPIObject.java | 12 +
.../analyzer/workerimpl/BulkJobManager.java | 229 ++++++++++++++++++
.../operator/KruizeDeploymentInfo.java | 1 +
.../com/autotune/utils/KruizeConstants.java | 1 +
4 files changed, 243 insertions(+)
create mode 100644 src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java
diff --git a/src/main/java/com/autotune/analyzer/serviceObjects/KubernetesAPIObject.java b/src/main/java/com/autotune/analyzer/serviceObjects/KubernetesAPIObject.java
index 0a6d52ecf..d24cc3638 100644
--- a/src/main/java/com/autotune/analyzer/serviceObjects/KubernetesAPIObject.java
+++ b/src/main/java/com/autotune/analyzer/serviceObjects/KubernetesAPIObject.java
@@ -49,14 +49,26 @@ public String getType() {
return type;
}
+ public void setType(String type) {
+ this.type = type;
+ }
+
public String getName() {
return name;
}
+ public void setName(String name) {
+ this.name = name;
+ }
+
public String getNamespace() {
return namespace;
}
+ public void setNamespace(String namespace) {
+ this.namespace = namespace;
+ }
+
@JsonProperty(KruizeConstants.JSONKeys.CONTAINERS)
public List getContainerAPIObjects() {
return containerAPIObjects;
diff --git a/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java b/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java
new file mode 100644
index 000000000..985475313
--- /dev/null
+++ b/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java
@@ -0,0 +1,229 @@
+package com.autotune.analyzer.workerimpl;
+
+
+import com.autotune.analyzer.kruizeObject.KruizeObject;
+import com.autotune.analyzer.kruizeObject.RecommendationSettings;
+import com.autotune.analyzer.serviceObjects.*;
+import com.autotune.analyzer.utils.AnalyzerConstants;
+import com.autotune.common.data.ValidationOutputData;
+import com.autotune.common.data.dataSourceMetadata.*;
+import com.autotune.common.datasource.DataSourceInfo;
+import com.autotune.common.datasource.DataSourceManager;
+import com.autotune.common.k8sObjects.TrialSettings;
+import com.autotune.common.utils.CommonUtils;
+import com.autotune.database.service.ExperimentDBService;
+import com.autotune.operator.KruizeDeploymentInfo;
+import com.autotune.utils.KruizeConstants;
+import com.autotune.utils.Utils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.ProtocolException;
+import java.net.URL;
+import java.sql.Timestamp;
+import java.time.Instant;
+import java.time.LocalDateTime;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+public class BulkJobManager implements Runnable {
+ private static final Logger LOGGER = LoggerFactory.getLogger(BulkJobManager.class);
+
+ private String jobID;
+ private Map jobStatusMap;
+ private BulkInput bulkInput;
+
+ public BulkJobManager(String jobID, Map jobStatusMap, BulkInput payload) {
+ this.jobID = jobID;
+ this.jobStatusMap = jobStatusMap;
+ this.bulkInput = payload;
+ }
+
+ public static List appendExperiments(List allExperiments, String experimentName) {
+ allExperiments.add(experimentName);
+ return allExperiments;
+ }
+
+ @Override
+ public void run() {
+ try {
+
+ String uniqueKey = null;
+ // Process labels in the 'include' section
+ if (this.bulkInput.getFilter() != null && this.bulkInput.getFilter().getInclude() != null) {
+ // Initialize StringBuilder for uniqueKey
+ StringBuilder includeLabelsBuilder = new StringBuilder();
+ Map includeLabels = this.bulkInput.getFilter().getInclude().getLabels();
+ if (includeLabels != null && !includeLabels.isEmpty()) {
+ includeLabels.forEach((key, value) ->
+ includeLabelsBuilder.append(key).append("=").append("\"" + value + "\"").append(",")
+ );
+ // Remove trailing comma
+ if (includeLabelsBuilder.length() > 0) {
+ includeLabelsBuilder.setLength(includeLabelsBuilder.length() - 1);
+ }
+ LOGGER.info("Include Labels: " + includeLabelsBuilder.toString());
+ uniqueKey = includeLabelsBuilder.toString();
+ }
+ }
+ DataSourceMetadataInfo metadataInfo = null;
+ DataSourceManager dataSourceManager = new DataSourceManager();
+ DataSourceInfo datasource = CommonUtils.getDataSourceInfo("prometheus-1");
+
+
+ if (null != this.bulkInput.getTime_range() && this.bulkInput.getTime_range().getStart() != null && this.bulkInput.getTime_range().getEnd() != null) {
+ // Extract interval start and end times
+ String intervalEndTimeStr = this.bulkInput.getTime_range().getStart();
+ String intervalStartTimeStr = this.bulkInput.getTime_range().getEnd();
+ long interval_end_time_epoc = 0;
+ long interval_start_time_epoc = 0;
+ LocalDateTime localDateTime = LocalDateTime.parse(intervalEndTimeStr, DateTimeFormatter.ofPattern(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT));
+ interval_end_time_epoc = localDateTime.toEpochSecond(ZoneOffset.UTC);
+ Timestamp interval_end_time = Timestamp.from(localDateTime.toInstant(ZoneOffset.UTC));
+ localDateTime = LocalDateTime.parse(intervalStartTimeStr, DateTimeFormatter.ofPattern(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT));
+ interval_start_time_epoc = localDateTime.toEpochSecond(ZoneOffset.UTC);
+ Timestamp interval_start_time = Timestamp.from(localDateTime.toInstant(ZoneOffset.UTC));
+
+ int steps = 15 * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE;
+ //Get metaData
+ //example metadataInfo = dataSourceManager.importMetadataFromDataSource(datasource, uniqueKey, interval_start_time_epoc, interval_end_time_epoc, steps);
+ } else {
+ //Get metaData
+ //metadataInfo = dataSourceManager.importMetadataFromDataSource(datasource, uniqueKey, 0, 0, 0);
+ }
+ List recommendationsRequiredExperiments = new CopyOnWriteArrayList<>();
+ if (null == metadataInfo) {
+ jobStatusMap.get(jobID).setStatus("COMPLETED");
+ } else {
+ Collection dataSourceCollection = metadataInfo.getDataSourceHashMap().values();
+ for (DataSource ds : dataSourceCollection) {
+ HashMap clusterHashMap = ds.getDataSourceClusterHashMap();
+ for (DataSourceCluster dsc : clusterHashMap.values()) {
+ HashMap namespaceHashMap = dsc.getDataSourceNamespaceHashMap();
+ for (DataSourceNamespace namespace : namespaceHashMap.values()) {
+ HashMap dataSourceWorkloadHashMap = namespace.getDataSourceWorkloadHashMap();
+ if (dataSourceWorkloadHashMap != null) {
+ for (DataSourceWorkload dsw : dataSourceWorkloadHashMap.values()) {
+ HashMap dataSourceContainerHashMap = dsw.getDataSourceContainerHashMap();
+ if (dataSourceContainerHashMap != null) {
+ for (DataSourceContainer dc : dataSourceContainerHashMap.values()) {
+ CreateExperimentAPIObject createExperimentAPIObject = new CreateExperimentAPIObject();
+ createExperimentAPIObject.setMode("monitor");
+ createExperimentAPIObject.setTargetCluster("local");
+ createExperimentAPIObject.setApiVersion("v2.0");
+ String experiment_name = "prometheus-1" + "-" + dsc.getDataSourceClusterName() + "-" + namespace.getDataSourceNamespaceName()
+ + "-" + dsw.getDataSourceWorkloadName() + "(" + dsw.getDataSourceWorkloadType() + ")" + "-" + dc.getDataSourceContainerName();
+ createExperimentAPIObject.setExperimentName(experiment_name);
+ createExperimentAPIObject.setDatasource("prometheus-1");
+ createExperimentAPIObject.setClusterName(dsc.getDataSourceClusterName());
+ createExperimentAPIObject.setPerformanceProfile("resource-optimization-openshift");
+ List kubernetesAPIObjectList = new ArrayList<>();
+ KubernetesAPIObject kubernetesAPIObject = new KubernetesAPIObject();
+ ContainerAPIObject cao = new ContainerAPIObject(dc.getDataSourceContainerName(),
+ dc.getDataSourceContainerImageName(), null, null);
+ kubernetesAPIObject.setContainerAPIObjects(Arrays.asList(cao));
+ kubernetesAPIObject.setName(dsw.getDataSourceWorkloadName());
+ kubernetesAPIObject.setType(dsw.getDataSourceWorkloadType());
+ kubernetesAPIObject.setNamespace(namespace.getDataSourceNamespaceName());
+ kubernetesAPIObjectList.add(kubernetesAPIObject);
+ createExperimentAPIObject.setKubernetesObjects(kubernetesAPIObjectList);
+ RecommendationSettings rs = new RecommendationSettings();
+ rs.setThreshold(0.1);
+ createExperimentAPIObject.setRecommendationSettings(rs);
+ TrialSettings trialSettings = new TrialSettings();
+ trialSettings.setMeasurement_durationMinutes("15min");
+ createExperimentAPIObject.setTrialSettings(trialSettings);
+ List kruizeExpList = new ArrayList<>();
+
+ createExperimentAPIObject.setExperiment_id(Utils.generateID(createExperimentAPIObject.toString()));
+ createExperimentAPIObject.setStatus(AnalyzerConstants.ExperimentStatus.IN_PROGRESS);
+
+ try {
+ ValidationOutputData output = new ExperimentDBService().addExperimentToDB(createExperimentAPIObject);
+ if (output.isSuccess()) {
+ jobStatusMap.get(jobID).getData().getExperiments().setNewExperiments(
+ appendExperiments(jobStatusMap.get(jobID).getData().getExperiments().getNewExperiments(), experiment_name)
+ );
+ }
+ recommendationsRequiredExperiments.add(experiment_name);
+ } catch (Exception e) {
+ LOGGER.info(e.getMessage());
+ }
+ }
+ }
+
+ }
+ }
+ }
+ }
+ }
+ jobStatusMap.get(jobID).setStatus("INPROGRESS");
+ jobStatusMap.get(jobID).getData().getRecommendations().getData().setInqueue(recommendationsRequiredExperiments);
+ jobStatusMap.get(jobID).getData().getRecommendations().setTotalCount(recommendationsRequiredExperiments.size());
+
+ }
+ ExecutorService executor = Executors.newFixedThreadPool(3);
+ for (String name : recommendationsRequiredExperiments) {
+ executor.submit(() -> {
+ URL url = null;
+ try {
+ url = new URL(String.format(KruizeDeploymentInfo.recommendations_url, name));
+ } catch (MalformedURLException e) {
+ throw new RuntimeException(e);
+ }
+ HttpURLConnection connection = null;
+ try {
+ connection = (HttpURLConnection) url.openConnection();
+ } catch (IOException e) {
+ LOGGER.error(e.getMessage());
+ throw new RuntimeException(e);
+ }
+ try {
+ connection.setRequestMethod("POST");
+ } catch (ProtocolException e) {
+ LOGGER.error(e.getMessage());
+ throw new RuntimeException(e);
+ }
+ // Get the response code from the generateRecommendations POST request
+ int statusCode = 0;
+ try {
+ jobStatusMap.get(jobID).getData().getRecommendations().getData().moveToProgress(name);
+ LOGGER.info(String.format(KruizeDeploymentInfo.recommendations_url, name));
+ statusCode = connection.getResponseCode();
+ LOGGER.info(String.format(KruizeDeploymentInfo.recommendations_url, name));
+ } catch (IOException e) {
+ LOGGER.error(e.getMessage());
+ throw new RuntimeException(e);
+ }
+
+ if (statusCode == HttpURLConnection.HTTP_CREATED) {
+ jobStatusMap.get(jobID).getData().getRecommendations().getData().moveToCompleted(name);
+ } else {
+ jobStatusMap.get(jobID).getData().getRecommendations().getData().moveToFailed(name);
+ }
+ jobStatusMap.get(jobID).setProgress(jobStatusMap.get(jobID).getData().getRecommendations().getData().completionPercentage());
+ if (jobStatusMap.get(jobID).getProgress() == 100) {
+ jobStatusMap.get(jobID).setStatus("COMPLETED"); // Mark the job as completed
+ jobStatusMap.get(jobID).setEndTime(Instant.now());
+ jobStatusMap.get(jobID).getData().getRecommendations().setCompletedCount(
+ jobStatusMap.get(jobID).getData().getRecommendations().getData().getCompleted().size()
+ );
+ }
+ // Close the connection
+ connection.disconnect();
+ });
+ }
+ } catch (Exception e) {
+ LOGGER.error(e.getMessage());
+ e.printStackTrace();
+
+ }
+ }
+}
diff --git a/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java b/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java
index 4be00ff62..c7ddcff9a 100644
--- a/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java
+++ b/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java
@@ -79,6 +79,7 @@ public class KruizeDeploymentInfo {
public static Integer bulk_update_results_limit = 100;
public static Boolean local = false;
public static Boolean log_http_req_resp = false;
+ public static String recommendations_url;
public static int generate_recommendations_date_range_limit_in_days = 15;
public static Integer delete_partition_threshold_in_days = DELETE_PARTITION_THRESHOLD_IN_DAYS;
diff --git a/src/main/java/com/autotune/utils/KruizeConstants.java b/src/main/java/com/autotune/utils/KruizeConstants.java
index d44ffdd16..f8137448e 100644
--- a/src/main/java/com/autotune/utils/KruizeConstants.java
+++ b/src/main/java/com/autotune/utils/KruizeConstants.java
@@ -674,6 +674,7 @@ public static final class KRUIZE_CONFIG_ENV_NAME {
public static final String CLOUDWATCH_LOGS_LOG_LEVEL = "logging_cloudwatch_logLevel";
public static final String LOCAL = "local";
public static final String LOG_HTTP_REQ_RESP = "logAllHttpReqAndResp";
+ public static final String RECOMMENDATIONS_URL = "recommendationsURL";
}
public static final class RecommendationEngineConstants {
From ffe14a9eec97a359bc877cd8759021f763b9ead1 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Tue, 8 Oct 2024 14:30:08 +0530
Subject: [PATCH 38/64] incorporated review comments
Signed-off-by: msvinaykumar
---
.../CreateExperimentConfigBean.java | 93 +++++++++++++++++++
.../analyzer/workerimpl/BulkJobManager.java | 41 ++++----
.../operator/KruizeDeploymentInfo.java | 1 +
.../com/autotune/utils/KruizeConstants.java | 19 ++++
4 files changed, 136 insertions(+), 18 deletions(-)
create mode 100644 src/main/java/com/autotune/analyzer/kruizeObject/CreateExperimentConfigBean.java
diff --git a/src/main/java/com/autotune/analyzer/kruizeObject/CreateExperimentConfigBean.java b/src/main/java/com/autotune/analyzer/kruizeObject/CreateExperimentConfigBean.java
new file mode 100644
index 000000000..2f521a212
--- /dev/null
+++ b/src/main/java/com/autotune/analyzer/kruizeObject/CreateExperimentConfigBean.java
@@ -0,0 +1,93 @@
+package com.autotune.analyzer.kruizeObject;
+
+public class CreateExperimentConfigBean {
+
+ // Private fields
+ private String mode;
+ private String target;
+ private String version;
+ private String datasourceName;
+ private String performanceProfile;
+ private double threshold;
+ private String measurementDurationStr;
+ private int measurementDuration;
+
+ // Getters and Setters
+ public String getMode() {
+ return mode;
+ }
+
+ public void setMode(String mode) {
+ this.mode = mode;
+ }
+
+ public String getTarget() {
+ return target;
+ }
+
+ public void setTarget(String target) {
+ this.target = target;
+ }
+
+ public String getVersion() {
+ return version;
+ }
+
+ public void setVersion(String version) {
+ this.version = version;
+ }
+
+ public String getDatasourceName() {
+ return datasourceName;
+ }
+
+ public void setDatasourceName(String datasourceName) {
+ this.datasourceName = datasourceName;
+ }
+
+ public String getPerformanceProfile() {
+ return performanceProfile;
+ }
+
+ public void setPerformanceProfile(String performanceProfile) {
+ this.performanceProfile = performanceProfile;
+ }
+
+ public double getThreshold() {
+ return threshold;
+ }
+
+ public void setThreshold(double threshold) {
+ this.threshold = threshold;
+ }
+
+ public String getMeasurementDurationStr() {
+ return measurementDurationStr;
+ }
+
+ public void setMeasurementDurationStr(String measurementDurationStr) {
+ this.measurementDurationStr = measurementDurationStr;
+ }
+
+ public int getMeasurementDuration() {
+ return measurementDuration;
+ }
+
+ public void setMeasurementDuration(int measurementDuration) {
+ this.measurementDuration = measurementDuration;
+ }
+
+ @Override
+ public String toString() {
+ return "CreateExperimentConfigBean{" +
+ "mode='" + mode + '\'' +
+ ", target='" + target + '\'' +
+ ", version='" + version + '\'' +
+ ", datasourceName='" + datasourceName + '\'' +
+ ", performanceProfile='" + performanceProfile + '\'' +
+ ", threshold=" + threshold +
+ ", measurementDurationStr='" + measurementDurationStr + '\'' +
+ ", measurementDuration=" + measurementDuration +
+ '}';
+ }
+}
diff --git a/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java b/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java
index 985475313..171d37f6f 100644
--- a/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java
+++ b/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java
@@ -33,6 +33,9 @@
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
+import static com.autotune.operator.KruizeDeploymentInfo.bulk_thread_pool_size;
+import static com.autotune.utils.KruizeConstants.KRUIZE_BULK_API.*;
+
public class BulkJobManager implements Runnable {
private static final Logger LOGGER = LoggerFactory.getLogger(BulkJobManager.class);
@@ -73,9 +76,12 @@ public void run() {
uniqueKey = includeLabelsBuilder.toString();
}
}
+ if (null == this.bulkInput.getDatasource()) {
+ this.bulkInput.setDatasource(CREATE_EXPERIMENT_CONFIG_BEAN.getDatasourceName());
+ }
DataSourceMetadataInfo metadataInfo = null;
DataSourceManager dataSourceManager = new DataSourceManager();
- DataSourceInfo datasource = CommonUtils.getDataSourceInfo("prometheus-1");
+ DataSourceInfo datasource = CommonUtils.getDataSourceInfo(this.bulkInput.getDatasource());
if (null != this.bulkInput.getTime_range() && this.bulkInput.getTime_range().getStart() != null && this.bulkInput.getTime_range().getEnd() != null) {
@@ -90,17 +96,16 @@ public void run() {
localDateTime = LocalDateTime.parse(intervalStartTimeStr, DateTimeFormatter.ofPattern(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT));
interval_start_time_epoc = localDateTime.toEpochSecond(ZoneOffset.UTC);
Timestamp interval_start_time = Timestamp.from(localDateTime.toInstant(ZoneOffset.UTC));
-
- int steps = 15 * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE;
- //Get metaData
+ int steps = CREATE_EXPERIMENT_CONFIG_BEAN.getMeasurementDuration() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE; // todo fetch experiment recommendations setting measurement
+ //TODO Get metaData
//example metadataInfo = dataSourceManager.importMetadataFromDataSource(datasource, uniqueKey, interval_start_time_epoc, interval_end_time_epoc, steps);
} else {
- //Get metaData
+ //TODO Get metaData
//metadataInfo = dataSourceManager.importMetadataFromDataSource(datasource, uniqueKey, 0, 0, 0);
}
List recommendationsRequiredExperiments = new CopyOnWriteArrayList<>();
if (null == metadataInfo) {
- jobStatusMap.get(jobID).setStatus("COMPLETED");
+ jobStatusMap.get(jobID).setStatus(COMPLETED);
} else {
Collection dataSourceCollection = metadataInfo.getDataSourceHashMap().values();
for (DataSource ds : dataSourceCollection) {
@@ -115,15 +120,15 @@ public void run() {
if (dataSourceContainerHashMap != null) {
for (DataSourceContainer dc : dataSourceContainerHashMap.values()) {
CreateExperimentAPIObject createExperimentAPIObject = new CreateExperimentAPIObject();
- createExperimentAPIObject.setMode("monitor");
- createExperimentAPIObject.setTargetCluster("local");
- createExperimentAPIObject.setApiVersion("v2.0");
- String experiment_name = "prometheus-1" + "-" + dsc.getDataSourceClusterName() + "-" + namespace.getDataSourceNamespaceName()
- + "-" + dsw.getDataSourceWorkloadName() + "(" + dsw.getDataSourceWorkloadType() + ")" + "-" + dc.getDataSourceContainerName();
+ createExperimentAPIObject.setMode(CREATE_EXPERIMENT_CONFIG_BEAN.getMode());
+ createExperimentAPIObject.setTargetCluster(CREATE_EXPERIMENT_CONFIG_BEAN.getTarget());
+ createExperimentAPIObject.setApiVersion(CREATE_EXPERIMENT_CONFIG_BEAN.getVersion());
+ String experiment_name = this.bulkInput.getDatasource() + "|" + dsc.getDataSourceClusterName() + "|" + namespace.getDataSourceNamespaceName()
+ + "|" + dsw.getDataSourceWorkloadName() + "(" + dsw.getDataSourceWorkloadType() + ")" + "|" + dc.getDataSourceContainerName();
createExperimentAPIObject.setExperimentName(experiment_name);
- createExperimentAPIObject.setDatasource("prometheus-1");
+ createExperimentAPIObject.setDatasource(this.bulkInput.getDatasource());
createExperimentAPIObject.setClusterName(dsc.getDataSourceClusterName());
- createExperimentAPIObject.setPerformanceProfile("resource-optimization-openshift");
+ createExperimentAPIObject.setPerformanceProfile(CREATE_EXPERIMENT_CONFIG_BEAN.getPerformanceProfile());
List kubernetesAPIObjectList = new ArrayList<>();
KubernetesAPIObject kubernetesAPIObject = new KubernetesAPIObject();
ContainerAPIObject cao = new ContainerAPIObject(dc.getDataSourceContainerName(),
@@ -135,10 +140,10 @@ public void run() {
kubernetesAPIObjectList.add(kubernetesAPIObject);
createExperimentAPIObject.setKubernetesObjects(kubernetesAPIObjectList);
RecommendationSettings rs = new RecommendationSettings();
- rs.setThreshold(0.1);
+ rs.setThreshold(CREATE_EXPERIMENT_CONFIG_BEAN.getThreshold());
createExperimentAPIObject.setRecommendationSettings(rs);
TrialSettings trialSettings = new TrialSettings();
- trialSettings.setMeasurement_durationMinutes("15min");
+ trialSettings.setMeasurement_durationMinutes(CREATE_EXPERIMENT_CONFIG_BEAN.getMeasurementDurationStr());
createExperimentAPIObject.setTrialSettings(trialSettings);
List kruizeExpList = new ArrayList<>();
@@ -164,12 +169,12 @@ public void run() {
}
}
}
- jobStatusMap.get(jobID).setStatus("INPROGRESS");
+ jobStatusMap.get(jobID).setStatus(IN_PROGRESS);
jobStatusMap.get(jobID).getData().getRecommendations().getData().setInqueue(recommendationsRequiredExperiments);
jobStatusMap.get(jobID).getData().getRecommendations().setTotalCount(recommendationsRequiredExperiments.size());
}
- ExecutorService executor = Executors.newFixedThreadPool(3);
+ ExecutorService executor = Executors.newFixedThreadPool(bulk_thread_pool_size);
for (String name : recommendationsRequiredExperiments) {
executor.submit(() -> {
URL url = null;
@@ -210,7 +215,7 @@ public void run() {
}
jobStatusMap.get(jobID).setProgress(jobStatusMap.get(jobID).getData().getRecommendations().getData().completionPercentage());
if (jobStatusMap.get(jobID).getProgress() == 100) {
- jobStatusMap.get(jobID).setStatus("COMPLETED"); // Mark the job as completed
+ jobStatusMap.get(jobID).setStatus(COMPLETED); // Mark the job as completed
jobStatusMap.get(jobID).setEndTime(Instant.now());
jobStatusMap.get(jobID).getData().getRecommendations().setCompletedCount(
jobStatusMap.get(jobID).getData().getRecommendations().getData().getCompleted().size()
diff --git a/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java b/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java
index c7ddcff9a..5ce3e4ee5 100644
--- a/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java
+++ b/src/main/java/com/autotune/operator/KruizeDeploymentInfo.java
@@ -80,6 +80,7 @@ public class KruizeDeploymentInfo {
public static Boolean local = false;
public static Boolean log_http_req_resp = false;
public static String recommendations_url;
+ public static Integer bulk_thread_pool_size = 3;
public static int generate_recommendations_date_range_limit_in_days = 15;
public static Integer delete_partition_threshold_in_days = DELETE_PARTITION_THRESHOLD_IN_DAYS;
diff --git a/src/main/java/com/autotune/utils/KruizeConstants.java b/src/main/java/com/autotune/utils/KruizeConstants.java
index f8137448e..b81264ae3 100644
--- a/src/main/java/com/autotune/utils/KruizeConstants.java
+++ b/src/main/java/com/autotune/utils/KruizeConstants.java
@@ -17,6 +17,8 @@
package com.autotune.utils;
+import com.autotune.analyzer.kruizeObject.CreateExperimentConfigBean;
+
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.TimeZone;
@@ -675,6 +677,7 @@ public static final class KRUIZE_CONFIG_ENV_NAME {
public static final String LOCAL = "local";
public static final String LOG_HTTP_REQ_RESP = "logAllHttpReqAndResp";
public static final String RECOMMENDATIONS_URL = "recommendationsURL";
+ public static final String BULK_THREAD_POOL_SIZE = "bulkThreadPoolSize";
}
public static final class RecommendationEngineConstants {
@@ -768,5 +771,21 @@ public static final class KRUIZE_BULK_API {
public static final String ERROR = "error";
public static final String JOB_NOT_FOUND_MSG = "Job not found";
public static final String IN_PROGRESS = "IN_PROGRESS";
+ public static final String COMPLETED = "COMPLETED";
+ // Static final MonitoringConfigBean instance
+ public static final CreateExperimentConfigBean CREATE_EXPERIMENT_CONFIG_BEAN;
+
+ // Static block to initialize the Bean
+ static {
+ CREATE_EXPERIMENT_CONFIG_BEAN = new CreateExperimentConfigBean();
+ CREATE_EXPERIMENT_CONFIG_BEAN.setMode("monitor");
+ CREATE_EXPERIMENT_CONFIG_BEAN.setTarget("local");
+ CREATE_EXPERIMENT_CONFIG_BEAN.setVersion("v2.0");
+ CREATE_EXPERIMENT_CONFIG_BEAN.setDatasourceName("prometheus-1");
+ CREATE_EXPERIMENT_CONFIG_BEAN.setPerformanceProfile("resource-optimization-local-monitoring");
+ CREATE_EXPERIMENT_CONFIG_BEAN.setThreshold(0.1);
+ CREATE_EXPERIMENT_CONFIG_BEAN.setMeasurementDurationStr("15min");
+ CREATE_EXPERIMENT_CONFIG_BEAN.setMeasurementDuration(15);
+ }
}
}
From 7ee24116d9c096aa666692e455aabc0835657ec7 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Tue, 8 Oct 2024 15:58:00 +0530
Subject: [PATCH 39/64] incorporated review comments
Signed-off-by: msvinaykumar
---
.../CreateExperimentConfigBean.java | 18 ++++++++++++++++++
.../com/autotune/utils/KruizeConstants.java | 2 +-
2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/src/main/java/com/autotune/analyzer/kruizeObject/CreateExperimentConfigBean.java b/src/main/java/com/autotune/analyzer/kruizeObject/CreateExperimentConfigBean.java
index 2f521a212..5303441f6 100644
--- a/src/main/java/com/autotune/analyzer/kruizeObject/CreateExperimentConfigBean.java
+++ b/src/main/java/com/autotune/analyzer/kruizeObject/CreateExperimentConfigBean.java
@@ -1,5 +1,23 @@
+/*******************************************************************************
+ * Copyright (c) 2022, 2022 Red Hat, IBM Corporation and others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
package com.autotune.analyzer.kruizeObject;
+/**
+ * This is a placeholder class for the bulk API createExperiment template to store defaults
+ */
public class CreateExperimentConfigBean {
// Private fields
diff --git a/src/main/java/com/autotune/utils/KruizeConstants.java b/src/main/java/com/autotune/utils/KruizeConstants.java
index b81264ae3..154d8c000 100644
--- a/src/main/java/com/autotune/utils/KruizeConstants.java
+++ b/src/main/java/com/autotune/utils/KruizeConstants.java
@@ -772,7 +772,7 @@ public static final class KRUIZE_BULK_API {
public static final String JOB_NOT_FOUND_MSG = "Job not found";
public static final String IN_PROGRESS = "IN_PROGRESS";
public static final String COMPLETED = "COMPLETED";
- // Static final MonitoringConfigBean instance
+            // TODO: Bulk API Create Experiments defaults
public static final CreateExperimentConfigBean CREATE_EXPERIMENT_CONFIG_BEAN;
// Static block to initialize the Bean
From eb02284610841fc8259a81416b51ff70b5aa7271 Mon Sep 17 00:00:00 2001
From: msvinaykumar
Date: Tue, 8 Oct 2024 16:04:12 +0530
Subject: [PATCH 40/64] incorporated review comments
Signed-off-by: msvinaykumar
---
.../analyzer/workerimpl/BulkJobManager.java | 41 +++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java b/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java
index 171d37f6f..4b165ba4b 100644
--- a/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java
+++ b/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java
@@ -1,3 +1,18 @@
+/*******************************************************************************
+ * Copyright (c) 2022, 2022 Red Hat, IBM Corporation and others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
package com.autotune.analyzer.workerimpl;
@@ -36,6 +51,32 @@
import static com.autotune.operator.KruizeDeploymentInfo.bulk_thread_pool_size;
import static com.autotune.utils.KruizeConstants.KRUIZE_BULK_API.*;
+
+/**
+ * The `run` method processes bulk input to create experiments and generates resource optimization recommendations.
+ * It handles the creation of experiment names based on various data source components, makes HTTP POST requests
+ * to generate recommendations, and updates job statuses based on the progress of the recommendations.
+ *
+ *