Merge pull request #988 from chandrams/scale_test_auto

Included the latest scalability test and automation scripts
kruize · Feb 2, 2024 · 5bc74c0 · 5bc74c0
2 parents 71d9fbd + 4460d80
commit 5bc74c0
Show file tree

Hide file tree

Showing 8 changed files with 1,031 additions and 0 deletions.
diff --git a/tests/scripts/remote_monitoring_tests/scalability_test.md b/tests/scripts/remote_monitoring_tests/scalability_test.md
@@ -0,0 +1,69 @@
+# **Kruize Remote monitoring scalability test**
+
+The Kruize Remote monitoring scalability test validates the behaviour of the [Kruize remote monitoring APIs](/design/MonitoringModeAPI.md) by scaling the kruize pods and running multiple experiments, each with 15 days of metrics uploaded, while capturing various metrics.
+
+## Test description
+
+   The test does the following:
+   - Deploys kruize in non-CRD mode using the [deploy script](https://github.com/kruize/autotune/blob/master/deploy.sh) from the autotune repo
+   - Scales kruize replicas to 10
+   - Exposes kruize service
+   - Creates a resource optimization performance profile using the [createPerformanceProfile API](/design/PerformanceProfileAPI.md) 
+   - Creates the specified no. of client threads that run in parallel, creating multiple experiments and uploading the metrics results for 15 days
+
+## Prerequisites for running the tests:
+- OpenShift cluster
+- Tools like kubectl, oc
+
+## How to run the test?
+
+Use the following command to run the test:
+
+```
+cd <KRUIZE_REPO>/tests/scripts/remote_monitoring_tests/scale_test
+./remote_monitoring_scale_test_bulk.sh [-i Kruize image] [-r results directory path] [-u No. of experiments (default - 5000)] [-d No. of days of results (default - 15)] [-n No. of clients (default - 20)] [-m results duration interval in mins, (default - 15)] [-t interval hours (default - 6)] [-s Initial start date (default - 2023-01-10T00:00:00.000Z)] [-q query db interval in mins, (default - 10)]
+```
+
+Where values for remote_monitoring_scale_test_bulk.sh are:
+
+```
+usage: remote_monitoring_scale_test_bulk.sh
+        [ -i ] : optional. Kruize docker image to be used for testing
+                 default - quay.io/kruize/autotune:mvp_demo
+	[ -r ] : Results directory path
+	[-u No. of experiments (default - 5000)]
+	[-d No. of days of results (default - 15)] 
+	[-n No. of clients (default - 20)]
+	[-m results duration interval in mins, (default - 15)]
+	[-t interval hours (default - 6)]
+	[-s Initial start date (default - 2023-01-10T00:00:00.000Z)]
+	[-q query db interval in mins, (default - 10)]
+```
+
+For example,
+
+```
+cd <KRUIZE_REPO>/tests/scripts/remote_monitoring_tests/scale_test
+./remote_monitoring_scale_test_bulk.sh -i quay.io/kruize/autotune_operator:0.0.20_mvp  -u 250  -d 15  -n 20 -t 6  -q 10  -s 2023-01-10T00:00:00.000Z  -r /tmp/scale_test_results
+
+```
+
+Once the tests are complete, manually check the logs for any exceptions, errors or crashes, and verify that the execution times captured in exec_time.log are as expected.
+
+The commands below are used by the script to capture the execution time and the count of experiments and results from the database:
+
+Command used to capture the execution time:
+
+```
+grep -m28 -H 'Time elapsed:' *.log | awk -F '[:.]' '{ sum[$1] += ($4 * 3600) + ($5 * 60) + $6 } END { for (key in sum) { printf "%s: Total time elapsed: %02d:%02d:%02d\n", key, sum[key] / 3600, (sum[key] / 60) % 60, sum[key] % 60 } }' | sort
+
+```
+
+The above command captures the execution time for 7 days of metrics data upload. To cover a different number of days, change -m28 (i.e. -m<4 * 7>) to -m<4 * num_days_of_res>.
+
+Commands to fetch the count of experiments and results from the DB:
+
+```
+kubectl exec -it `kubectl get pods -o=name -n openshift-tuning | grep postgres` -n openshift-tuning -- psql -U admin -d kruizeDB -c "SELECT count(*) from public.kruize_experiments ;"; kubectl exec -it `kubectl get pods -o=name -n openshift-tuning | grep postgres` -n openshift-tuning -- psql -U admin -d kruizeDB -c "SELECT count(*) from public.kruize_results ;"
+
+```
diff --git a/tests/scripts/remote_monitoring_tests/scale_test/json_files/create_exp.json b/tests/scripts/remote_monitoring_tests/scale_test/json_files/create_exp.json
@@ -0,0 +1,31 @@
+{
+  "version": "1.0",
+  "experiment_name": "quarkus-resteasy-kruize-min-http-response-time-db_10",
+  "cluster_name": "cluster-one-division-bell",
+  "performance_profile": "resource-optimization-openshift",
+  "mode": "monitor",
+  "target_cluster": "remote",
+  "kubernetes_objects": [
+    {
+      "type": "deployment",
+      "name": "tfb-qrh-deployment_5",
+      "namespace": "default_5",
+      "containers": [
+        {
+          "container_image_name": "kruize/tfb-db:1.15",
+          "container_name": "tfb-server-0"
+        },
+        {
+          "container_image_name": "kruize/tfb-qrh:1.13.2.F_et17",
+          "container_name": "tfb-server-1"
+        }
+      ]
+    }
+  ],
+  "trial_settings": {
+    "measurement_duration": "15min"
+  },
+  "recommendation_settings": {
+    "threshold": "0.1"
+  }
+}
diff --git a/tests/scripts/remote_monitoring_tests/scale_test/json_files/profile.json b/tests/scripts/remote_monitoring_tests/scale_test/json_files/profile.json
@@ -0,0 +1,28 @@
+{
+  "name": "resource-optimization-openshift",
+  "profile_version": 1,
+  "k8s_type": "openshift",
+  "slo": {
+    "slo_class": "resource_usage",
+    "direction": "minimize",
+    "objective_function": {
+      "function_type": "expression",
+      "expression": "cpuRequest"
+    },
+    "function_variables": [
+      {
+        "name": "cpuRequest",
+        "datasource": "prometheus",
+        "value_type": "double",
+        "kubernetes_object": "container",
+        "query": "kube_pod_container_resource_requests{pod=~'$DEPLOYMENT_NAME$-[^-]*-[^-]*$', container='$CONTAINER_NAME$', namespace='$NAMESPACE', resource='cpu', unit='core'}",
+        "aggregation_functions": [
+          {
+            "function": "avg",
+            "query": "avg(kube_pod_container_resource_requests{pod=~\"$DEPLOYMENT_NAME$-[^-]*-[^-]*$\", container=\"$CONTAINER_NAME$\", namespace=\"$NAMESPACE\", resource=\"cpu\", unit=\"core\"})"
+          }
+        ]
+      }
+    ]
+  }
+}
diff --git a/tests/scripts/remote_monitoring_tests/scale_test/json_files/results.json b/tests/scripts/remote_monitoring_tests/scale_test/json_files/results.json
@@ -0,0 +1,201 @@
+{
+  "version": "3.0",
+  "experiment_name": "quarkus-resteasy-kruize-min-http-response-time-db_4",
+  "interval_start_time": "2023-01-01T00:00:00.000Z",
+  "interval_end_time": "2023-01-01T00:00:00.000Z",
+  "kubernetes_objects": [
+    {
+      "type": "deployment",
+      "name": "tfb-qrh-deployment_5",
+      "namespace": "default_5",
+      "containers": [
+        {
+          "container_image_name": "kruize/tfb-db:1.15",
+          "container_name": "tfb-server-0",
+          "metrics": [
+            {
+              "name": "cpuRequest",
+              "results": {
+                "aggregation_info": {
+                  "sum": 0,
+                  "avg": 0,
+                  "format": "cores"
+                }
+              }
+            },
+            {
+              "name": "cpuLimit",
+              "results": {
+                "aggregation_info": {
+                  "sum": 0,
+                  "avg": 0,
+                  "format": "cores"
+                }
+              }
+            },
+            {
+              "name": "cpuUsage",
+              "results": {
+                "aggregation_info": {
+                  "min": 0,
+                  "max": 0,
+                  "sum": 0,
+                  "avg": 0,
+                  "format": "cores"
+                }
+              }
+            },
+            {
+              "name": "cpuThrottle",
+              "results": {
+                "aggregation_info": {
+                  "sum": 0,
+                  "max": 0,
+                  "avg": 0,
+                  "format": "cores"
+                }
+              }
+            },
+            {
+              "name": "memoryRequest",
+              "results": {
+                "aggregation_info": {
+                  "sum": 260.85,
+                  "avg": 50.21,
+                  "format": "MiB"
+                }
+              }
+            },
+            {
+              "name": "memoryLimit",
+              "results": {
+                "aggregation_info": {
+                  "sum": 700,
+                  "avg": 100,
+                  "format": "MiB"
+                }
+              }
+            },
+            {
+              "name": "memoryUsage",
+              "results": {
+                "aggregation_info": {
+                  "min": 50.6,
+                  "max": 198.5,
+                  "sum": 298.5,
+                  "avg": 40.1,
+                  "format": "MiB"
+                }
+              }
+            },
+            {
+              "name": "memoryRSS",
+              "results": {
+                "aggregation_info": {
+                  "min": 50.6,
+                  "max": 523.6,
+                  "sum": 123.6,
+                  "avg": 31.91,
+                  "format": "MiB"
+                }
+              }
+            }
+          ]
+        },
+        {
+          "container_image_name": "kruize/tfb-qrh:1.13.2.F_et17",
+          "container_name": "tfb-server-1",
+          "metrics": [
+            {
+              "name": "cpuRequest",
+              "results": {
+                "aggregation_info": {
+                  "sum": 4.4,
+                  "avg": 1.1,
+                  "format": "cores"
+                }
+              }
+            },
+            {
+              "name": "cpuLimit",
+              "results": {
+                "aggregation_info": {
+                  "sum": 2,
+                  "avg": 0.5,
+                  "format": "cores"
+                }
+              }
+            },
+            {
+              "name": "cpuUsage",
+              "results": {
+                "aggregation_info": {
+                  "min": 0.14,
+                  "max": 0.84,
+                  "sum": 0.84,
+                  "avg": 0.12,
+                  "format": "cores"
+                }
+              }
+            },
+            {
+              "name": "cpuThrottle",
+              "results": {
+                "aggregation_info": {
+                  "sum": 0.19,
+                  "max": 0.09,
+                  "avg": 0.045,
+                  "format": "cores"
+                }
+              }
+            },
+            {
+              "name": "memoryRequest",
+              "results": {
+                "aggregation_info": {
+                  "sum": 250.85,
+                  "avg": 50.21,
+                  "format": "MiB"
+                }
+              }
+            },
+            {
+              "name": "memoryLimit",
+              "results": {
+                "aggregation_info": {
+                  "sum": 500,
+                  "avg": 100,
+                  "format": "MiB"
+                }
+              }
+            },
+            {
+              "name": "memoryUsage",
+              "results": {
+                "aggregation_info": {
+                  "min": 50.6,
+                  "max": 198.5,
+                  "sum": 198.5,
+                  "avg": 40.1,
+                  "format": "MiB"
+                }
+              }
+            },
+            {
+              "name": "memoryRSS",
+              "results": {
+                "aggregation_info": {
+                  "min": 50.6,
+                  "max": 123.6,
+                  "sum": 123.6,
+                  "avg": 31.91,
+                  "format": "MiB"
+                }
+              }
+            }
+          ]
+        }
+      ]
+    }
+  ]
+}