stretch mode: Fixing TFA issues identified in stretch mode execution
Signed-off-by: Pawan Dhiran <[email protected]>
pdhiran committed Jan 13, 2025
1 parent aa2b0d5 commit aa7b6cb
Showing 8 changed files with 150 additions and 77 deletions.
23 changes: 12 additions & 11 deletions suites/pacific/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -319,17 +319,6 @@ tests:
delete_pool: true
desc: Test stretch Cluster netsplit scenario between data sites

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement

- test:
name: Netsplit Scenarios data-tiebreaker sites
module: test_stretch_netsplit_scenarios.py
@@ -360,3 +349,15 @@ tests:
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
desc: Perform post-deployment negative tests on stretch mode
comments: -ve scenarios bug - 2293147

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement
24 changes: 13 additions & 11 deletions suites/quincy/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -319,17 +319,6 @@ tests:
delete_pool: true
desc: Test stretch Cluster netsplit scenario between data sites

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement

- test:
name: Netsplit Scenarios data-tiebreaker sites
module: test_stretch_netsplit_scenarios.py
Expand All @@ -352,6 +341,7 @@ tests:
delete_pool: true
desc: Test the cluster when the Data site is moved to maintenance mode


- test:
name: Negative scenarios - post-deployment
module: test_stretch_negative_scenarios.py
Expand All @@ -360,3 +350,15 @@ tests:
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
desc: Perform post-deployment negative tests on stretch mode
comments: -ve scenarios bug - 2293147

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement
23 changes: 12 additions & 11 deletions suites/reef/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -333,17 +333,6 @@ tests:
desc: Test stretch Cluster netsplit scenario between data sites
comments: Active bug - 2318975

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement

- test:
name: Netsplit Scenarios data-tiebreaker sites
module: test_stretch_netsplit_scenarios.py
@@ -374,3 +363,15 @@ tests:
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
desc: Perform post-deployment negative tests on stretch mode
comments: -ve scenarios bug - 2293147

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement
32 changes: 16 additions & 16 deletions suites/squid/rados/tier-2_rados_test-brownfield.yaml
@@ -146,14 +146,14 @@ tests:
polarion-id: CEPH-83574439
abort-on-fail: false

# commented until fix merged in Squid: #2326892
# - test:
# name: "issue repro-obj snap and pool snap deletion"
# module: test_pool_snap.py
# desc: obj snap deletion when pool snapshot is deleted on unfixed build
# polarion-id: CEPH-83602685
# config:
# issue_reproduction: true
# Fix for #2326892 merged in Squid - test re-enabled
- test:
name: "issue repro-obj snap and pool snap deletion"
module: test_pool_snap.py
desc: obj snap deletion when pool snapshot is deleted on unfixed build
polarion-id: CEPH-83602685
config:
issue_reproduction: true

- test:
name: Upgrade cluster to latest 8.x ceph version
Expand All @@ -169,14 +169,14 @@ tests:
destroy-cluster: false
abort-on-fail: true

# commented until fix merged in Squid: #2326892
# - test:
# name: "verify fix-obj snap and pool snap deletion"
# module: test_pool_snap.py
# desc: obj snap deletion when pool snapshot is deleted on fixed build
# polarion-id: CEPH-83602685
# config:
# verify_fix: true
# Fix for #2326892 merged in Squid - test re-enabled
- test:
name: "verify fix-obj snap and pool snap deletion"
module: test_pool_snap.py
desc: obj snap deletion when pool snapshot is deleted on fixed build
polarion-id: CEPH-83602685
config:
verify_fix: true
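
The two re-enabled entries above drive the same module twice around the upgrade: once on the unfixed build with issue_reproduction: true and once on the upgraded build with verify_fix: true. A rough sketch of how such a flag split could look inside a run() entry point is shown below; the branching and the helper names reproduce_issue and verify_fix are illustrative assumptions, not the actual test_pool_snap.py implementation, and log is assumed to be the module logger as in the other tests.

def run(ceph_cluster, **kw):
    """Dispatch the pool/object snapshot check for the pre- or post-upgrade phase."""
    config = kw.get("config", {})

    if config.get("issue_reproduction"):
        # Pre-upgrade phase: exercise the scenario on the unfixed build so the
        # buggy behaviour is recorded before the cluster is upgraded.
        return reproduce_issue(ceph_cluster)
    if config.get("verify_fix"):
        # Post-upgrade phase: repeat the scenario and confirm the fixed behaviour.
        return verify_fix(ceph_cluster)

    log.error("config must set either issue_reproduction or verify_fix")
    return 1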

# Running basic rbd and rgw tests after upgrade
- test:
24 changes: 12 additions & 12 deletions suites/squid/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -4,7 +4,6 @@
# This test case is Openstack only and cannot be run in Baremetal env due to test constraints.
# Stretch mode deployment in BM is run by suite : suites/squid/rados/deploy-stretch-cluster-mode.yaml

# RHOS-d run duration: 300 mins
tests:
- test:
name: Install ceph pre-requisites
@@ -332,17 +331,6 @@ tests:
desc: Test stretch Cluster netsplit scenario between data sites
comments: Active bug - 2318975

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement

- test:
name: Netsplit Scenarios data-tiebreaker sites
module: test_stretch_netsplit_scenarios.py
@@ -373,3 +361,15 @@ tests:
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
desc: Perform post-deployment negative tests on stretch mode
comments: -ve scenarios bug - 2293147

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement
51 changes: 48 additions & 3 deletions tests/rados/stretch_cluster.py
@@ -75,13 +75,20 @@ def run(ceph_cluster, **kw):
log.info("Verifying forced recovery and healthy in stretch environment")

pool_name = "stretch_pool_recovery"
if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
if not rados_obj.create_pool(pool_name=pool_name):
log.error("Failed to create the replicated Pool")
return 1

# getting the acting set for the created pool
acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool_name)

# Get the initial object count, to verify later that writes succeeded post recovery
pool_stat_init = rados_obj.get_cephdf_stats(pool_name=pool_name)
init_objects = pool_stat_init["stats"]["objects"]
log.debug(
f"initial number of objects on the pool : {pool_name} is {init_objects}"
)

log.info(
f"Killing 2 OSD's from acting set : {acting_pg_set} to verify recovery"
)
Expand All @@ -91,20 +98,56 @@ def run(ceph_cluster, **kw):
log.error(f"Unable to stop the OSD : {osd_id}")
return 1

# Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
time.sleep(25)
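# Apply recovery thread tuning for the test run; the matching action="rm" call at the end of the test reverts it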
rados_obj.change_recovery_threads(config={}, action="set")

# Guide link : https://docs.ceph.com/en/reef/rados/configuration/mon-osd-interaction/#monitor-settings
log.info(
"Updating the behaviour of the test case. It is expected that PGs will enter peered state when there "
"is a partial site failure. The bugzilla below has more details on the behaviour. "
"Bugzilla : https://bugzilla.redhat.com/show_bug.cgi?id=2328649 "
)

log.info(
"'osd_heartbeat_grace': 20 sec & 'mon_osd_down_out_interval': 10 min. "
"Sleeping for 11 minutes for OSDs to be marked down & out of the cluster, so that recovery starts"
)
time.sleep(60 * 11)

log.info("Stopped 2 OSD's from acting set, starting to wait for recovery")

if not rados_obj.bench_write(pool_name=pool_name, **config):
log.error("Failed to write objects into the Pool")
return 1

pool_stat_intrim = rados_obj.get_cephdf_stats(pool_name=pool_name)
intrim_objects = pool_stat_intrim["stats"]["objects"]
log.debug(
f"number of objects on the pool with OSD down & out: {pool_name} is {intrim_objects}"
)

log.debug("Triggering forced recovery in stretch mode")
cmd = "ceph osd force_recovery_stretch_mode --yes-i-really-mean-it"
rados_obj.run_ceph_command(cmd)
log.info("Triggered the recovery in stretch mode")

# Object count should be greater than the initial number of objects
if int(intrim_objects) <= int(init_objects):
log.error(
"Write ops should be possible, number of objects in the pool has not changed"
)
log.info(
f"cluster output dumps:\n"
f"ceph status : \n{rados_obj.run_ceph_command(cmd='ceph -s')}\n"
f"health detail : \n{rados_obj.run_ceph_command(cmd='ceph health detail')}\n"
f"mon dump : \n{rados_obj.run_ceph_command(cmd='ceph mon dump')}\n"
f"ceph report: \n {rados_obj.run_ceph_command(cmd='ceph report')}\n"
f"osd tree : \n{rados_obj.run_ceph_command(cmd='ceph osd df tree')}\n"
)
raise Exception(f"Pool {pool_name} has {intrim_objects} objs")
log.info(
f"Successfully wrote {int(intrim_objects) - int(init_objects)} objects on pool {pool_name} in degraded mode\n"
)

log.debug("Starting the stopped OSD's")
for osd_id in stop_osds:
if not rados_obj.change_osd_state(action="restart", target=osd_id):
Expand All @@ -123,6 +166,8 @@ def run(ceph_cluster, **kw):
rados_obj.run_ceph_command(cmd)
rados_obj.delete_pool(pool=pool_name)

# Revert the recovery thread tuning applied at the start of the test
rados_obj.change_recovery_threads(config={}, action="rm")

log.info("Cluster has successfully recovered and is in healthy state")
return 0
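
The eleven-minute sleep added above simply waits out 'osd_heartbeat_grace' plus 'mon_osd_down_out_interval' before checking that writes still land on the pool. A minimal polling alternative is sketched below; it assumes, as the surrounding test does, that rados_obj.run_ceph_command() returns the parsed JSON of the given ceph command and that the module-level log object is available. The helper name wait_for_osds_down_and_out is hypothetical and not part of the framework.

import time


def wait_for_osds_down_and_out(rados_obj, osd_ids, timeout=900, interval=30):
    """Poll 'ceph osd dump' until every OSD in osd_ids is reported down and out."""
    osd_ids = {int(osd) for osd in osd_ids}
    end_time = time.time() + timeout
    while time.time() < end_time:
        osd_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
        # "up" and "in" are 0/1 flags on each entry of the "osds" array
        pending = [
            entry["osd"]
            for entry in osd_dump["osds"]
            if entry["osd"] in osd_ids and (entry["up"] or entry["in"])
        ]
        if not pending:
            log.info(f"OSDs {osd_ids} are marked down and out, recovery can start")
            return True
        log.debug(f"Still waiting for OSDs to be marked down & out : {pending}")
        time.sleep(interval)
    log.error(f"OSDs {osd_ids} were not marked down & out within {timeout} seconds")
    return False

Called with stop_osds right after the OSDs are stopped, a check like this would bound the wait the same way while returning as soon as the monitors mark the OSDs out.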

1 change: 1 addition & 0 deletions tests/rados/test_pool_snap.py
@@ -21,6 +21,7 @@ def run(ceph_cluster, **kw):
- Pacific: BZ-2272361
- Quincy: BZ-2272362
- Reef: BZ-2263169
- Squid: BZ-2326892
Test to verify object snap deletion when parent pool snapshot is deleted.
1. Create a replicated pool with default config
2. Use rados put to write a single object to the pool