Merge pull request #228 from red-hat-storage/sync_us--main

Syncing latest changes from upstream main for ramen
red-hat-storage · Apr 2, 2024 · 5e984e4 · 5e984e4
2 parents ea5c8fe + c98cd12
commit 5e984e4
Show file tree

Hide file tree

Showing 16 changed files with 372 additions and 66 deletions.
diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
@@ -30,9 +30,6 @@ jobs:
     - name: Install ramenctl
       run: pip install -e ramenctl
 
-    - name: Build ramen-operator container
-      run: make docker-build
-
     - name: Delete clusters
       if: ${{ always() }}
       working-directory: test
@@ -50,6 +47,9 @@ jobs:
           cd test
           drenv start --max-workers ${{ env.MAX_WORKERS }} --name-prefix ${{ env.NAME_PREFIX }} envs/regional-dr.yaml
 
+    - name: Build ramen-operator container
+      run: make docker-build
+
     - name: Deploy ramen
       run: ramenctl deploy --name-prefix ${{ env.NAME_PREFIX }} test/envs/regional-dr.yaml
 

diff --git a/Makefile b/Makefile
@@ -183,7 +183,7 @@ test-drenv: ## Run drenv tests.
 test-ramenctl: ## Run ramenctl tests.
 	$(MAKE) -C ramenctl
 
-e2e-rdr: generate manifests docker-build ## Run rdr-e2e tests.
+e2e-rdr: generate manifests ## Run rdr-e2e tests.
 	./e2e/rdr-e2e.sh
 
 coverage:

diff --git a/docs/user-quick-start.md b/docs/user-quick-start.md
@@ -149,22 +149,17 @@ enough resources:
    for the details.
    Tested with version v1.12.0.
 
-1. Install the `virtctl` tool. See
-   [virtctl install](https://kubevirt.io/quickstart_minikube/#virtctl)
-   for the details.
-   Tested with version v1.0.1.
+1. Install the `virtctl` tool.
 
    ```
-   curl -L -o virtctl https://github.com/kubevirt/kubevirt/releases/download/v1.0.1/virtctl-v1.0.1-linux-amd64
-   ```
-
-   After download completes for `virtctl` issue these commands.
-
-   ```
-   chmod +x virtctl
+   curl -L -o virtctl https://github.com/kubevirt/kubevirt/releases/download/v1.2.0/virtctl-v1.2.0-linux-amd64
    sudo install virtctl /usr/local/bin
+   rm virtctl
    ```
 
+   For more info see
+   [virtctl install](https://kubevirt.io/quickstart_minikube/#virtctl)
+
 1. Install `mc` tool
 
    ```

diff --git a/hack/install-setup-envtest.sh b/hack/install-setup-envtest.sh
@@ -3,7 +3,7 @@ set -e
 
 script_dir="$(cd "$(dirname "$0")" && pwd)"
 
-required_version="latest"
+required_version="release-0.17"
 source_url="sigs.k8s.io/controller-runtime/tools/setup-envtest@${required_version}"
 target_dir="${script_dir}/../testbin"
 target_path="${target_dir}/setup-envtest"

diff --git a/test/README.md b/test/README.md
@@ -58,10 +58,16 @@ environment.
    See [Installing Helm](https://helm.sh/docs/intro/install/) for other options.
    Tested with version v3.11.
 
-1. Install the `virtctl` tool. See
+1. Install the `virtctl` tool
+
+   ```
+   curl -L -o virtctl https://github.com/kubevirt/kubevirt/releases/download/v1.2.0/virtctl-v1.2.0-linux-amd64
+   sudo install virtctl /usr/local/bin
+   rm virtctl
+   ```
+
+   For more info see
    [virtctl install](https://kubevirt.io/quickstart_minikube/#virtctl)
-   for the details.
-   Tested with version v1.0.1.
 
 1. Install `mc` tool
 

diff --git a/test/addons/csi-addons/start b/test/addons/csi-addons/start
@@ -9,7 +9,7 @@ import sys
 import drenv
 from drenv import kubectl
 
-VERSION = "v0.7.0"
+VERSION = "v0.8.0"
 BASE_URL = f"https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/{VERSION}/deploy/controller"
 
 

diff --git a/test/addons/rbd-mirror/start b/test/addons/rbd-mirror/start
@@ -9,34 +9,11 @@ import os
 import sys
 
 import drenv
-from drenv import ceph
 from drenv import kubectl
 
 POOL_NAME = "replicapool"
 
 
-def clear_blocklist(cluster):
-    """
-    Clear ceph blocklist.
-
-    TODO: Maybe it is better to fail.
-    """
-    blocklist = ceph.list_osd_blocklist(cluster)
-    if blocklist:
-        print(f"Clearing ceph osd blocklist on cluster {cluster}")
-        print(json.dumps(blocklist, indent=2))
-        ceph.clear_osd_blocklist(cluster)
-
-
-def check_blocklist(cluster):
-    """
-    Fail if ceph osd blocklist is not empty.
-    """
-    blocklist = ceph.list_osd_blocklist(cluster)
-    if blocklist:
-        raise RuntimeError(f"Ceph blocklist on cluster {cluster}: {blocklist}")
-
-
 def fetch_secret_info(cluster):
     info = {}
 
@@ -152,9 +129,6 @@ os.chdir(os.path.dirname(__file__))
 cluster1 = sys.argv[1]
 cluster2 = sys.argv[2]
 
-clear_blocklist(cluster1)
-clear_blocklist(cluster2)
-
 cluster1_info = fetch_secret_info(cluster1)
 cluster2_info = fetch_secret_info(cluster2)
 
@@ -170,7 +144,4 @@ wait_until_pool_mirroring_is_healthy(cluster2)
 deploy_vrc_sample(cluster1)
 deploy_vrc_sample(cluster2)
 
-check_blocklist(cluster1)
-check_blocklist(cluster2)
-
 print("Mirroring was setup successfully")
diff --git a/test/addons/rbd-mirror/test b/test/addons/rbd-mirror/test
@@ -8,22 +8,12 @@ import os
 import sys
 import time
 
-from drenv import ceph
 from drenv import kubectl
 
 POOL_NAME = "replicapool"
 PVC_NAME = "rbd-pvc"
 
 
-def check_blocklist(cluster):
-    """
-    Fail if ceph osd blocklist is not empty.
-    """
-    blocklist = ceph.list_osd_blocklist(cluster)
-    if blocklist:
-        raise RuntimeError(f"Ceph blocklist on cluster {cluster}: {blocklist}")
-
-
 def rbd(*args, cluster=None):
     """
     Run a rbd command using the ceph toolbox on the specified cluster and
@@ -174,11 +164,5 @@ os.chdir(os.path.dirname(__file__))
 cluster1 = sys.argv[1]
 cluster2 = sys.argv[2]
 
-check_blocklist(cluster1)
-check_blocklist(cluster2)
-
 test_volume_replication(cluster1, cluster2)
 test_volume_replication(cluster2, cluster1)
-
-check_blocklist(cluster1)
-check_blocklist(cluster2)
diff --git a/test/configs/kubevirt/vm-dv-odr-regional.yaml b/test/configs/kubevirt/vm-dv-odr-regional.yaml
@@ -0,0 +1,11 @@
+# SPDX-FileCopyrightText: The RamenDR authors
+# SPDX-License-Identifier: Apache-2.0
+
+---
+repo: https://github.com/ramendr/ocm-ramen-samples.git
+path: subscription/kubevirt/vm-dv-odr-regional
+branch: main
+name: vm-dv
+namespace: vm-dv
+dr_policy: dr-policy
+pvc_label: vm
diff --git a/test/configs/kubevirt/vm-dvt-odr-regional.yaml b/test/configs/kubevirt/vm-dvt-odr-regional.yaml
@@ -0,0 +1,11 @@
+# SPDX-FileCopyrightText: The RamenDR authors
+# SPDX-License-Identifier: Apache-2.0
+
+---
+repo: https://github.com/ramendr/ocm-ramen-samples.git
+path: subscription/kubevirt/vm-dvt-odr-regional
+branch: main
+name: vm-dvt
+namespace: vm-dvt
+dr_policy: dr-policy
+pvc_label: vm
diff --git a/test/configs/kubevirt/vm-pvc-odr-regional.yaml b/test/configs/kubevirt/vm-pvc-odr-regional.yaml
@@ -0,0 +1,11 @@
+# SPDX-FileCopyrightText: The RamenDR authors
+# SPDX-License-Identifier: Apache-2.0
+
+---
+repo: https://github.com/ramendr/ocm-ramen-samples.git
+path: subscription/kubevirt/vm-pvc-odr-regional
+branch: main
+name: vm-pvc
+namespace: vm-pvc
+dr_policy: dr-policy
+pvc_label: vm
diff --git a/test/envs/odr.yaml b/test/envs/odr.yaml
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: The RamenDR authors
+# SPDX-License-Identifier: Apache-2.0
+
+# Environment for testing an external OpenShift Regional-DR setup using
+# basic-test. This assumes that you used `oc login` to add all the clusters
+# to the kubeconfig file used by the test.
+---
+name: perf123
+ramen:
+  hub: perf1
+  clusters: [perf2, perf3]
+  topology: regional-dr
+  features:
+    volsync: true
diff --git a/test/scripts/network.xml b/test/scripts/network.xml
@@ -0,0 +1,10 @@
+<network>
+  <name>default</name>
+  <bridge name='virbr0'/>
+  <forward/>
+  <ip address='192.168.22.1' netmask='255.255.255.0'>
+    <dhcp>
+      <range start='192.168.22.2' end='192.168.22.254'/>
+    </dhcp>
+  </ip>
+</network>
diff --git a/test/scripts/setup-libvirt b/test/scripts/setup-libvirt
@@ -1,11 +1,13 @@
 #!/bin/sh -e
 
+base=$(dirname $0)
+
 default_network_exists() {
     virsh -c qemu:///system net-list | grep -q default
 }
 
 create_default_network() {
-    virsh -c qemu:///system net-define /usr/share/libvirt/networks/default.xml
+    virsh -c qemu:///system net-define $base/network.xml
     virsh -c qemu:///system net-autostart default
     virsh -c qemu:///system net-start default
 }

diff --git a/test/stress-test/README.md b/test/stress-test/README.md
@@ -0,0 +1,136 @@
+# drenv stress test
+
+This directory includes the drenv stress test for evaluating `drenv start`
+robustness and debugging failed runs.
+
+The test supports 2 modes of operation:
+
+- Collecting stats from long unattended test run
+- Debugging a failed run
+
+## Collecting stats
+
+In this example we run 100 runs starting the regional-dr environment. If
+a run fails, we delete the clusters and continue. This is useful for
+understanding which failures are the most common.
+
+```
+stress-test/run -r 100 ../envs/regional-dr.yaml
+```
+
+This creates the `out` directory in the current directory, logging each
+run in a separate log file, and saving test results in `test.json` file.
+This run took more than 17 hours (626 seconds per run):
+
+```
+$ ls out
+000.log  013.log  026.log  039.log  052.log  065.log  078.log  091.log
+001.log  014.log  027.log  040.log  053.log  066.log  079.log  092.log
+002.log  015.log  028.log  041.log  054.log  067.log  080.log  093.log
+003.log  016.log  029.log  042.log  055.log  068.log  081.log  094.log
+004.log  017.log  030.log  043.log  056.log  069.log  082.log  095.log
+005.log  018.log  031.log  044.log  057.log  070.log  083.log  096.log
+006.log  019.log  032.log  045.log  058.log  071.log  084.log  097.log
+007.log  020.log  033.log  046.log  059.log  072.log  085.log  098.log
+008.log  021.log  034.log  047.log  060.log  073.log  086.log  099.log
+009.log  022.log  035.log  048.log  061.log  074.log  087.log  test.json
+010.log  023.log  036.log  049.log  062.log  075.log  088.log
+011.log  024.log  037.log  050.log  063.log  076.log  089.log
+012.log  025.log  038.log  051.log  064.log  077.log  090.log
+```
+
+To get test stats:
+
+```
+$ cat out/test.json | jq .stats
+{
+  "runs": 100,
+  "passed": 84,
+  "failed": 16,
+  "success": 84.0,
+  "time": 62694.784591522985,
+  "passed-time": 52647.00043984903,
+  "failed-time": 9723.622515744006
+}
+```
+
+To find the failed runs you can look up the individual test
+results:
+
+```
+$ cat out/test.json
+...
+    {
+      "name": "007",
+      "passed": false,
+      "time": 460.2368865620083
+    },
+```
+
+Or grep the logs:
+
+```
+$ grep ^drenv.commands.Error out/*.log
+out/007.log:drenv.commands.Error: Command failed:
+out/014.log:drenv.commands.Error: Command failed:
+out/026.log:drenv.commands.Error: Command failed:
+out/027.log:drenv.commands.Error: Command failed:
+out/028.log:drenv.commands.Error: Command failed:
+out/031.log:drenv.commands.Error: Command failed:
+out/036.log:drenv.commands.Error: Command failed:
+out/043.log:drenv.commands.Error: Command failed:
+out/044.log:drenv.commands.Error: Command failed:
+out/051.log:drenv.commands.Error: Command failed:
+out/052.log:drenv.commands.Error: Command failed:
+out/066.log:drenv.commands.Error: Command failed:
+out/074.log:drenv.commands.Error: Command failed:
+out/075.log:drenv.commands.Error: Command failed:
+out/085.log:drenv.commands.Error: Command failed:
+out/089.log:drenv.commands.Error: Command failed:
+```
+
+## Debugging a failed run
+
+In this mode the run exits cleanly after the first failure, leaving the
+clusters running for inspection.
+
+```
+stress-test/run -r 100 -x ../envs/regional-dr.yaml
+```
+
+Because the failures are random, a run may fail very quickly or only
+after many hours. As drenv becomes more reliable, debugging random
+failures will become harder.
+
+> [!IMPORTANT]
+> After debugging the failure, you need to delete the environment
+> manually.
+
+In this example the run failed after the first run:
+
+```
+$ ls out.3:
+out.3:
+000.log  test.json
+```
+
+And here the failure happened in run 021, after 21 successful runs:
+
+```
+$ ls out.4:
+000.log  003.log  006.log  009.log  012.log  015.log  018.log  021.log
+001.log  004.log  007.log  010.log  013.log  016.log  019.log  test.json
+002.log  005.log  008.log  011.log  014.log  017.log  020.log
+```
+
+In both cases the last run is the failure:
+
+```
+$ grep ^drenv.commands.Error out.[34]/*.log
+out.3/000.log:drenv.commands.Error: Command failed:
+out.4/021.log:drenv.commands.Error: Command failed:
+```
+
+The clusters are running, hopefully in the same state as when the run
+failed. Sometimes the cluster fixes itself after the failure; this
+usually means some timeout was too short.