diff --git a/monitoring/local_monitoring/create_humaneval_exp.json b/monitoring/local_monitoring/create_humaneval_exp.json new file mode 100644 index 00000000..42944065 --- /dev/null +++ b/monitoring/local_monitoring/create_humaneval_exp.json @@ -0,0 +1,27 @@ +[{ + "version": "v2.0", + "experiment_name": "monitor_humaneval_workload", + "cluster_name": "default", + "performance_profile": "resource-optimization-openshift", + "mode": "monitor", + "target_cluster": "local", + "datasource": "prometheus-1", + "kubernetes_objects": [ + { + "type": "statefulset", + "name": "human-eval-benchmark", + "namespace": "kruize-hackathon", + "containers": [ + { + "container_name": "human-eval-benchmark" + } + ] + } + ], + "trial_settings": { + "measurement_duration": "15min" + }, + "recommendation_settings": { + "threshold": "0.1" + } +}] diff --git a/monitoring/local_monitoring/create_llm-rag_exp.json b/monitoring/local_monitoring/create_llm-rag_exp.json new file mode 100644 index 00000000..c15f49e8 --- /dev/null +++ b/monitoring/local_monitoring/create_llm-rag_exp.json @@ -0,0 +1,27 @@ +[{ + "version": "v2.0", + "experiment_name": "monitor_llm-rag_workload", + "cluster_name": "default", + "performance_profile": "resource-optimization-openshift", + "mode": "monitor", + "target_cluster": "local", + "datasource": "prometheus-1", + "kubernetes_objects": [ + { + "type": "deployment", + "name": "llm", + "namespace": "kruize-hackathon", + "containers": [ + { + "container_name": "server" + } + ] + } + ], + "trial_settings": { + "measurement_duration": "15min" + }, + "recommendation_settings": { + "threshold": "0.1" + } +}] diff --git a/monitoring/local_monitoring/create_traininggpt_exp.json b/monitoring/local_monitoring/create_traininggpt_exp.json new file mode 100644 index 00000000..6e792ce4 --- /dev/null +++ b/monitoring/local_monitoring/create_traininggpt_exp.json @@ -0,0 +1,27 @@ +[{ + "version": "v2.0", + "experiment_name": "monitor_traininggpt_workload", + "cluster_name": 
"default", + "performance_profile": "resource-optimization-openshift", + "mode": "monitor", + "target_cluster": "local", + "datasource": "prometheus-1", + "kubernetes_objects": [ + { + "type": "statefulset", + "name": "training-gpt", + "namespace": "kruize-hackathon", + "containers": [ + { + "container_name": "training-gpt" + } + ] + } + ], + "trial_settings": { + "measurement_duration": "15min" + }, + "recommendation_settings": { + "threshold": "0.1" + } +}] diff --git a/monitoring/local_monitoring/create_trainingttm_exp.json b/monitoring/local_monitoring/create_trainingttm_exp.json new file mode 100644 index 00000000..b6ebb753 --- /dev/null +++ b/monitoring/local_monitoring/create_trainingttm_exp.json @@ -0,0 +1,27 @@ +[{ + "version": "v2.0", + "experiment_name": "monitor_trainingttm_workload", + "cluster_name": "default", + "performance_profile": "resource-optimization-openshift", + "mode": "monitor", + "target_cluster": "local", + "datasource": "prometheus-1", + "kubernetes_objects": [ + { + "type": "statefulset", + "name": "training-ttm", + "namespace": "kruize-hackathon", + "containers": [ + { + "container_name": "training-ttm" + } + ] + } + ], + "trial_settings": { + "measurement_duration": "15min" + }, + "recommendation_settings": { + "threshold": "0.1" + } +}]