From 922aba1eb2c9a7c33fcd54ad64d265111d6e8d11 Mon Sep 17 00:00:00 2001 From: kusumachalasani Date: Tue, 16 Jul 2024 16:02:49 +0530 Subject: [PATCH 1/3] jsons Signed-off-by: kusumachalasani --- .../local_monitoring/create_llm-rag_exp.json | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 monitoring/local_monitoring/create_llm-rag_exp.json diff --git a/monitoring/local_monitoring/create_llm-rag_exp.json b/monitoring/local_monitoring/create_llm-rag_exp.json new file mode 100644 index 00000000..80cb8a28 --- /dev/null +++ b/monitoring/local_monitoring/create_llm-rag_exp.json @@ -0,0 +1,28 @@ +[{ + "version": "v2.0", + "experiment_name": "monitor_llm-rag_workload", + "cluster_name": "default", + "performance_profile": "resource-optimization-openshift", + "mode": "monitor", + "target_cluster": "local", + "datasource": "prometheus-1", + "kubernetes_objects": [ + { + "type": "deployment", + "name": "llm", + "namespace": "kruize-hackathon", + "containers": [ + { + "container_image_name": "kruize/tfb-postgres-openshift:latest", + "container_name": "server" + } + ] + } + ], + "trial_settings": { + "measurement_duration": "15min" + }, + "recommendation_settings": { + "threshold": "0.1" + } +}] From 31d0f08dc058e2bae0dfca98a7275dc7d174004a Mon Sep 17 00:00:00 2001 From: kusumachalasani Date: Tue, 16 Jul 2024 16:09:09 +0530 Subject: [PATCH 2/3] acceleratejsons Signed-off-by: kusumachalasani --- .../create_humaneval_exp.json | 27 +++++++++++++++++++ .../local_monitoring/create_llm-rag_exp.json | 1 - .../create_traininggpt_exp.json | 27 +++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 monitoring/local_monitoring/create_humaneval_exp.json create mode 100644 monitoring/local_monitoring/create_traininggpt_exp.json diff --git a/monitoring/local_monitoring/create_humaneval_exp.json b/monitoring/local_monitoring/create_humaneval_exp.json new file mode 100644 index 00000000..42944065 --- /dev/null +++ 
b/monitoring/local_monitoring/create_humaneval_exp.json @@ -0,0 +1,27 @@ +[{ + "version": "v2.0", + "experiment_name": "monitor_humaneval_workload", + "cluster_name": "default", + "performance_profile": "resource-optimization-openshift", + "mode": "monitor", + "target_cluster": "local", + "datasource": "prometheus-1", + "kubernetes_objects": [ + { + "type": "statefulset", + "name": "human-eval-benchmark", + "namespace": "kruize-hackathon", + "containers": [ + { + "container_name": "human-eval-benchmark" + } + ] + } + ], + "trial_settings": { + "measurement_duration": "15min" + }, + "recommendation_settings": { + "threshold": "0.1" + } +}] diff --git a/monitoring/local_monitoring/create_llm-rag_exp.json b/monitoring/local_monitoring/create_llm-rag_exp.json index 80cb8a28..c15f49e8 100644 --- a/monitoring/local_monitoring/create_llm-rag_exp.json +++ b/monitoring/local_monitoring/create_llm-rag_exp.json @@ -13,7 +13,6 @@ "namespace": "kruize-hackathon", "containers": [ { - "container_image_name": "kruize/tfb-postgres-openshift:latest", "container_name": "server" } ] diff --git a/monitoring/local_monitoring/create_traininggpt_exp.json b/monitoring/local_monitoring/create_traininggpt_exp.json new file mode 100644 index 00000000..384a5307 --- /dev/null +++ b/monitoring/local_monitoring/create_traininggpt_exp.json @@ -0,0 +1,27 @@ +[{ + "version": "v2.0", + "experiment_name": "monitor_traininggpt_workload", + "cluster_name": "default", + "performance_profile": "resource-optimization-openshift", + "mode": "monitor", + "target_cluster": "local", + "datasource": "prometheus-1", + "kubernetes_objects": [ + { + "type": "statefulset", + "name": "traininggpt", + "namespace": "kruize-hackathon", + "containers": [ + { + "container_name": "traininggpt" + } + ] + } + ], + "trial_settings": { + "measurement_duration": "15min" + }, + "recommendation_settings": { + "threshold": "0.1" + } +}] From 2d77dde93711f48b9051ffb4a09013e53ddfe67e Mon Sep 17 00:00:00 2001 From: kusumachalasani 
Date: Fri, 19 Jul 2024 22:17:00 +0530 Subject: [PATCH 3/3] update exp jsons Signed-off-by: kusumachalasani --- .../create_traininggpt_exp.json | 4 +-- .../create_trainingttm_exp.json | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 monitoring/local_monitoring/create_trainingttm_exp.json diff --git a/monitoring/local_monitoring/create_traininggpt_exp.json b/monitoring/local_monitoring/create_traininggpt_exp.json index 384a5307..6e792ce4 100644 --- a/monitoring/local_monitoring/create_traininggpt_exp.json +++ b/monitoring/local_monitoring/create_traininggpt_exp.json @@ -9,11 +9,11 @@ "kubernetes_objects": [ { "type": "statefulset", - "name": "traininggpt", + "name": "training-gpt", "namespace": "kruize-hackathon", "containers": [ { - "container_name": "traininggpt" + "container_name": "training-gpt" } ] } diff --git a/monitoring/local_monitoring/create_trainingttm_exp.json b/monitoring/local_monitoring/create_trainingttm_exp.json new file mode 100644 index 00000000..b6ebb753 --- /dev/null +++ b/monitoring/local_monitoring/create_trainingttm_exp.json @@ -0,0 +1,27 @@ +[{ + "version": "v2.0", + "experiment_name": "monitor_trainingttm_workload", + "cluster_name": "default", + "performance_profile": "resource-optimization-openshift", + "mode": "monitor", + "target_cluster": "local", + "datasource": "prometheus-1", + "kubernetes_objects": [ + { + "type": "statefulset", + "name": "training-ttm", + "namespace": "kruize-hackathon", + "containers": [ + { + "container_name": "training-ttm" + } + ] + } + ], + "trial_settings": { + "measurement_duration": "15min" + }, + "recommendation_settings": { + "threshold": "0.1" + } +}]