diff --git a/suites/pacific/rados/tier-3_rados_test-location-stretch-mode.yaml b/suites/pacific/rados/tier-3_rados_test-location-stretch-mode.yaml
index d27e26f996e..51f3828b23b 100644
--- a/suites/pacific/rados/tier-3_rados_test-location-stretch-mode.yaml
+++ b/suites/pacific/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -319,17 +319,6 @@ tests:
         delete_pool: true
       desc: Test stretch Cluster netsplit scenario between data sites
 
-  - test:
-      name: OSD and host replacement
-      module: test_stretch_osd_serviceability_scenarios.py
-      polarion-id: CEPH-83575474
-      config:
-        pool_name: test_stretch_pool7
-        stretch_bucket: datacenter
-        tiebreaker_mon_site_name: tiebreaker
-        delete_pool: true
-      desc: Test stretch Cluster osd and Host replacement
-
   - test:
       name: Netsplit Scenarios data-tiebreaker sites
       module: test_stretch_netsplit_scenarios.py
@@ -360,3 +349,15 @@ tests:
         stretch_bucket: datacenter
         tiebreaker_mon_site_name: tiebreaker
       desc: Perform post-deployment negative tests on stretch mode
+      comments: -ve scenarios bug - 2293147
+
+  - test:
+      name: OSD and host replacement
+      module: test_stretch_osd_serviceability_scenarios.py
+      polarion-id: CEPH-83575474
+      config:
+        pool_name: test_stretch_pool7
+        stretch_bucket: datacenter
+        tiebreaker_mon_site_name: tiebreaker
+        delete_pool: true
+      desc: Test stretch Cluster osd and Host replacement
diff --git a/suites/quincy/rados/tier-3_rados_test-location-stretch-mode.yaml b/suites/quincy/rados/tier-3_rados_test-location-stretch-mode.yaml
index 32ec941eae9..902a0f1db58 100644
--- a/suites/quincy/rados/tier-3_rados_test-location-stretch-mode.yaml
+++ b/suites/quincy/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -319,17 +319,6 @@ tests:
         delete_pool: true
       desc: Test stretch Cluster netsplit scenario between data sites
 
-  - test:
-      name: OSD and host replacement
-      module: test_stretch_osd_serviceability_scenarios.py
-      polarion-id: CEPH-83575474
-      config:
-        pool_name: test_stretch_pool7
-        stretch_bucket: datacenter
-        tiebreaker_mon_site_name: tiebreaker
-        delete_pool: true
-      desc: Test stretch Cluster osd and Host replacement
-
   - test:
       name: Netsplit Scenarios data-tiebreaker sites
       module: test_stretch_netsplit_scenarios.py
@@ -352,6 +341,7 @@ tests:
         delete_pool: true
       desc: Test the cluster when the Data site is moved to maintenance mode
 
+
   - test:
       name: Negative scenarios - post-deployment
      module: test_stretch_negative_scenarios.py
@@ -360,3 +350,15 @@ tests:
         stretch_bucket: datacenter
         tiebreaker_mon_site_name: tiebreaker
       desc: Perform post-deployment negative tests on stretch mode
+      comments: -ve scenarios bug - 2293147
+
+  - test:
+      name: OSD and host replacement
+      module: test_stretch_osd_serviceability_scenarios.py
+      polarion-id: CEPH-83575474
+      config:
+        pool_name: test_stretch_pool7
+        stretch_bucket: datacenter
+        tiebreaker_mon_site_name: tiebreaker
+        delete_pool: true
+      desc: Test stretch Cluster osd and Host replacement
diff --git a/suites/reef/rados/tier-3_rados_test-location-stretch-mode.yaml b/suites/reef/rados/tier-3_rados_test-location-stretch-mode.yaml
index 870c219ae21..59deb0436c6 100644
--- a/suites/reef/rados/tier-3_rados_test-location-stretch-mode.yaml
+++ b/suites/reef/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -333,17 +333,6 @@ tests:
       desc: Test stretch Cluster netsplit scenario between data sites
       comments: Active bug - 2318975
 
-  - test:
-      name: OSD and host replacement
-      module: test_stretch_osd_serviceability_scenarios.py
-      polarion-id: CEPH-83575474
-      config:
-        pool_name: test_stretch_pool7
-        stretch_bucket: datacenter
-        tiebreaker_mon_site_name: tiebreaker
-        delete_pool: true
-      desc: Test stretch Cluster osd and Host replacement
-
   - test:
       name: Netsplit Scenarios data-tiebreaker sites
       module: test_stretch_netsplit_scenarios.py
@@ -374,3 +363,15 @@ tests:
         stretch_bucket: datacenter
         tiebreaker_mon_site_name: tiebreaker
       desc: Perform post-deployment negative tests on stretch mode
+      comments: -ve scenarios bug - 2293147
+
+  - test:
+      name: OSD and host replacement
+      module: test_stretch_osd_serviceability_scenarios.py
+      polarion-id: CEPH-83575474
+      config:
+        pool_name: test_stretch_pool7
+        stretch_bucket: datacenter
+        tiebreaker_mon_site_name: tiebreaker
+        delete_pool: true
+      desc: Test stretch Cluster osd and Host replacement
diff --git a/suites/squid/rados/tier-2_rados_test-brownfield.yaml b/suites/squid/rados/tier-2_rados_test-brownfield.yaml
index 3f995d50583..0a07995f10e 100644
--- a/suites/squid/rados/tier-2_rados_test-brownfield.yaml
+++ b/suites/squid/rados/tier-2_rados_test-brownfield.yaml
@@ -146,14 +146,14 @@ tests:
       polarion-id: CEPH-83574439
       abort-on-fail: false
 
-# commented until fix merged in Squid: #2326892
-#  - test:
-#      name: "issue repro-obj snap and pool snap deletion"
-#      module: test_pool_snap.py
-#      desc: obj snap deletion when pool snapshot is deleted on unfixed build
-#      polarion-id: CEPH-83602685
-#      config:
-#        issue_reproduction: true
+# commented until fix merged in Squid: #2326892 - Issue fixed
+  - test:
+      name: "issue repro-obj snap and pool snap deletion"
+      module: test_pool_snap.py
+      desc: obj snap deletion when pool snapshot is deleted on unfixed build
+      polarion-id: CEPH-83602685
+      config:
+        issue_reproduction: true
 
   - test:
       name: Upgrade cluster to latest 8.x ceph version
@@ -169,14 +169,14 @@ tests:
       destroy-cluster: false
       abort-on-fail: true
 
-# commented until fix merged in Squid: #2326892
-#  - test:
-#      name: "verify fix-obj snap and pool snap deletion"
-#      module: test_pool_snap.py
-#      desc: obj snap deletion when pool snapshot is deleted on fixed build
-#      polarion-id: CEPH-83602685
-#      config:
-#        verify_fix: true
+# commented until fix merged in Squid: #2326892 - Issue fixed
+  - test:
+      name: "verify fix-obj snap and pool snap deletion"
+      module: test_pool_snap.py
+      desc: obj snap deletion when pool snapshot is deleted on fixed build
+      polarion-id: CEPH-83602685
+      config:
+        verify_fix: true
 
 # Running basic rbd and rgw tests after upgrade
   - test:
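Note on the brownfield suite change above: the two previously commented-out test_pool_snap.py entries now run as a reproduce-then-verify pair around the upgrade step, driven only by the issue_reproduction and verify_fix config flags. The sketch below is illustrative, not the actual tests/rados/test_pool_snap.py implementation; it only shows how a cephci-style run() could branch on those two flags, with the real snapshot workflow omitted.

# Illustrative sketch only - not the real tests/rados/test_pool_snap.py.
# It demonstrates how issue_reproduction / verify_fix from the suite YAML
# could select pre-upgrade vs post-upgrade behaviour in a cephci-style module.
import logging

log = logging.getLogger(__name__)


def run(ceph_cluster, **kw):
    config = kw.get("config", {})

    if config.get("issue_reproduction"):
        # Pre-upgrade run on the unfixed build: exercise the pool-snapshot
        # deletion workflow and expect the stale object snap (BZ-2326892).
        log.info("Running issue-reproduction path on the unfixed build")
        expect_stale_snap = True
    elif config.get("verify_fix"):
        # Post-upgrade run on the fixed build: the same workflow should now
        # remove the object snap together with the pool snapshot.
        log.info("Running fix-verification path on the upgraded build")
        expect_stale_snap = False
    else:
        log.error("Neither issue_reproduction nor verify_fix was requested")
        return 1

    log.debug(f"expect_stale_snap={expect_stale_snap}; snapshot workflow omitted in this sketch")
    return 0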
diff --git a/suites/squid/rados/tier-3_rados_test-location-stretch-mode.yaml b/suites/squid/rados/tier-3_rados_test-location-stretch-mode.yaml
index faa1359b2d3..55acb57b9ed 100644
--- a/suites/squid/rados/tier-3_rados_test-location-stretch-mode.yaml
+++ b/suites/squid/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -4,7 +4,6 @@
 # This test case is Openstack only and cannot be run in Baremetal env due to test constrains.
 # Stretch mode deployment in BM is run by suite : suites/squid/rados/deploy-stretch-cluster-mode.yaml
-# RHOS-d run duration: 300 mins
 tests:
   - test:
       name: Install ceph pre-requisites
@@ -332,17 +331,6 @@ tests:
       desc: Test stretch Cluster netsplit scenario between data sites
       comments: Active bug - 2318975
 
-  - test:
-      name: OSD and host replacement
-      module: test_stretch_osd_serviceability_scenarios.py
-      polarion-id: CEPH-83575474
-      config:
-        pool_name: test_stretch_pool7
-        stretch_bucket: datacenter
-        tiebreaker_mon_site_name: tiebreaker
-        delete_pool: true
-      desc: Test stretch Cluster osd and Host replacement
-
   - test:
       name: Netsplit Scenarios data-tiebreaker sites
       module: test_stretch_netsplit_scenarios.py
@@ -373,3 +361,15 @@ tests:
         stretch_bucket: datacenter
         tiebreaker_mon_site_name: tiebreaker
       desc: Perform post-deployment negative tests on stretch mode
+      comments: -ve scenarios bug - 2293147
+
+  - test:
+      name: OSD and host replacement
+      module: test_stretch_osd_serviceability_scenarios.py
+      polarion-id: CEPH-83575474
+      config:
+        pool_name: test_stretch_pool7
+        stretch_bucket: datacenter
+        tiebreaker_mon_site_name: tiebreaker
+        delete_pool: true
+      desc: Test stretch Cluster osd and Host replacement
diff --git a/tests/rados/stretch_cluster.py b/tests/rados/stretch_cluster.py
index b72a091249a..a03ccef0ca9 100644
--- a/tests/rados/stretch_cluster.py
+++ b/tests/rados/stretch_cluster.py
@@ -75,13 +75,20 @@ def run(ceph_cluster, **kw):
     log.info("Verifying forced recovery and healthy in stretch environment")
 
     pool_name = "stretch_pool_recovery"
-    if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
+    if not rados_obj.create_pool(pool_name=pool_name):
         log.error("Failed to create the replicated Pool")
         return 1
 
     # getting the acting set for the created pool
     acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool_name)
 
+    # Getting the number of objects, to check if writes were successful later post recovery
+    pool_stat_init = rados_obj.get_cephdf_stats(pool_name=pool_name)
+    init_objects = pool_stat_init["stats"]["objects"]
+    log.debug(
+        f"initial number of objects on the pool : {pool_name} is {init_objects}"
+    )
+
     log.info(
         f"Killing 2 OSD's from acting set : {acting_pg_set} to verify recovery"
     )
@@ -91,8 +98,20 @@ def run(ceph_cluster, **kw):
             log.error(f"Unable to stop the OSD : {osd_id}")
             return 1
 
-    # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
-    time.sleep(25)
+    rados_obj.change_recovery_threads(config={}, action="set")
+
+    # Guide link : https://docs.ceph.com/en/reef/rados/configuration/mon-osd-interaction/#monitor-settings
+    log.info(
+        "Updating the behaviour of the test case. It is expected that PGs will enter peered state when there "
+        "is partial site failure. Below bugzilla has more details on the behaviour. "
+        "Bugzilla : https://bugzilla.redhat.com/show_bug.cgi?id=2328649 "
+    )
+
+    log.info(
+        "'osd_heartbeat_grace': 20 sec, & 'mon_osd_down_out_interval': 10 min. "
+        "Sleeping for 11 minutes for osd's to be marked down & out of the cluster, so that recovery starts"
+    )
+    time.sleep(60 * 11)
 
     log.info("Stopped 2 OSD's from acting set, starting to wait for recovery")
 
@@ -100,11 +119,35 @@ def run(ceph_cluster, **kw):
         log.error("Failed to write objects into the Pool")
         return 1
 
+    pool_stat_intrim = rados_obj.get_cephdf_stats(pool_name=pool_name)
+    intrim_objects = pool_stat_intrim["stats"]["objects"]
+    log.debug(
+        f"number of objects on the pool with OSD down & out: {pool_name} is {intrim_objects}"
+    )
+
     log.debug("Triggering forced recovery in stretch mode")
     cmd = "ceph osd force_recovery_stretch_mode --yes-i-really-mean-it"
     rados_obj.run_ceph_command(cmd)
     log.info("Triggered the recovery in stretch mode")
 
+    # Objects should be more than the initial no of objects
+    if int(intrim_objects) <= int(init_objects):
+        log.error(
+            "Write ops should be possible, number of objects in the pool has not changed"
+        )
+        log.info(
+            f"cluster output dumps:\n"
+            f"ceph status : \n{rados_obj.run_ceph_command(cmd='ceph -s')}\n"
+            f"health detail : \n{rados_obj.run_ceph_command(cmd='ceph health detail')}\n"
+            f"mon dump : \n{rados_obj.run_ceph_command(cmd='ceph mon dump')}\n"
+            f"ceph report: \n {rados_obj.run_ceph_command(cmd='ceph report')}\n"
+            f"osd tree : \n{rados_obj.run_ceph_command(cmd='ceph osd df tree')}\n"
+        )
+        raise Exception(f"Pool {pool_name} has {intrim_objects} objs")
+    log.info(
+        f"Successfully wrote {int(intrim_objects) - int(init_objects)} objects on pool {pool_name} in degraded mode\n"
+    )
+
     log.debug("Starting the stopped OSD's")
     for osd_id in stop_osds:
         if not rados_obj.change_osd_state(action="restart", target=osd_id):
@@ -123,6 +166,8 @@ def run(ceph_cluster, **kw):
     rados_obj.run_ceph_command(cmd)
     rados_obj.delete_pool(pool=pool_name)
 
+    rados_obj.change_recovery_threads(config={}, action="rm")
+
     log.info("Cluster has successfully recovered and is in healthy state")
     return 0
diff --git a/tests/rados/test_pool_snap.py b/tests/rados/test_pool_snap.py
index b1d2fce031b..3f2c77aadef 100644
--- a/tests/rados/test_pool_snap.py
+++ b/tests/rados/test_pool_snap.py
@@ -21,6 +21,7 @@ def run(ceph_cluster, **kw):
     - Pacific: BZ-2272361
     - Quincy: BZ-2272362
     - Reef: BZ-2263169
+    - Squid: BZ-2326892
     Test to verify object snap deletion when parent pool snapshot is deleted.
     1. Create a replicated pool with default config
     2. Use rados put to write a single object to the pool
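Note on the stretch_cluster.py hunk above: the fixed 25-second sleep (enough for osd_heartbeat_grace = 20 s to mark the OSDs down) is replaced by an 11-minute sleep so that mon_osd_down_out_interval (default 600 s) also expires and the stopped OSDs are marked out before degraded writes and forced recovery are checked. A polling alternative is sketched below; it is not part of the patch, and it assumes a run_ceph_command-style callable that returns parsed JSON, the way rados_obj.run_ceph_command() is used elsewhere in this diff.

# Sketch (not in the patch): poll 'ceph osd dump' instead of sleeping a fixed
# 11 minutes, returning as soon as the stopped OSDs are both down and out.
import time


def wait_for_osds_down_and_out(run_ceph_command, osd_ids, timeout=900, interval=30):
    """run_ceph_command is assumed to return the command output as parsed JSON."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        osd_dump = run_ceph_command(cmd="ceph osd dump")
        # Each entry in 'osds' carries integer 'up' and 'in' flags (1 = yes).
        states = {osd["osd"]: (osd["up"], osd["in"]) for osd in osd_dump["osds"]}
        if all(states.get(osd_id, (1, 1)) == (0, 0) for osd_id in osd_ids):
            return True
        time.sleep(interval)
    return False

Usage would be roughly wait_for_osds_down_and_out(rados_obj.run_ceph_command, stop_osds) in place of the time.sleep(60 * 11) call, trading the simple fixed wait for a dependency on the osd dump format.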
diff --git a/tests/rados/test_stretch_netsplit_scenarios.py b/tests/rados/test_stretch_netsplit_scenarios.py
index a4c84568a78..87092c71680 100644
--- a/tests/rados/test_stretch_netsplit_scenarios.py
+++ b/tests/rados/test_stretch_netsplit_scenarios.py
@@ -13,6 +13,7 @@
 from ceph.ceph_admin import CephAdmin
 from ceph.rados.core_workflows import RadosOrchestrator
 from ceph.rados.pool_workflows import PoolFunctions
+from tests.rados.monitor_configurations import MonConfigMethods
 from tests.rados.test_stretch_site_down import (
     get_stretch_site_hosts,
     post_site_down_checks,
@@ -42,17 +43,13 @@ def run(ceph_cluster, **kw):
     netsplit_site = config.get("netsplit_site", "DC1")
     tiebreaker_mon_site_name = config.get("tiebreaker_mon_site_name", "tiebreaker")
     cluster_nodes = ceph_cluster.get_nodes()
+    mon_obj = MonConfigMethods(rados_obj=rados_obj)
+    set_debug = config.get("set_debug", False)
     installer = ceph_cluster.get_nodes(role="installer")[0]
     init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
     log.debug(f"Initial time when test was started : {init_time}")
 
     try:
-        # Starting to flush IP table rules on all hosts
-        for host in cluster_nodes:
-            log.debug(f"Proceeding to flush iptable rules on host : {host.hostname}")
-            host.exec_command(sudo=True, cmd="iptables -F", long_running=True)
-            host.exec_command(sudo=True, cmd="reboot")
-            time.sleep(20)
 
         if not stretch_enabled_checks(rados_obj=rados_obj):
             log.error(
@@ -63,6 +60,15 @@ def run(ceph_cluster, **kw):
         log.info(
             f"Starting Netsplit scenario with site : {netsplit_site}. Pre-checks Passed"
         )
+
+        if set_debug:
+            log.debug(
+                "Setting up debug configs on the cluster for mon, osd & Mgr daemons"
+            )
+            mon_obj.set_config(section="osd", name="debug_osd", value="20/20")
+            mon_obj.set_config(section="mon", name="debug_mon", value="30/30")
+            mon_obj.set_config(section="mgr", name="debug_mgr", value="20/20")
+
         osd_tree_cmd = "ceph osd tree"
         buckets = rados_obj.run_ceph_command(osd_tree_cmd)
         dc_buckets = [d for d in buckets["nodes"] if d.get("type") == "datacenter"]
@@ -178,6 +184,14 @@ def run(ceph_cluster, **kw):
                 f"Stretch Cluster is not marked as degraded even though we have "
                 f"netsplit b/w data sites : {stretch_details}"
             )
+            log.info(
+                f"cluster output dumps:\n"
+                f"ceph status : \n{rados_obj.run_ceph_command(cmd='ceph -s')}\n"
+                f"health detail : \n{rados_obj.run_ceph_command(cmd='ceph health detail')}\n"
+                f"mon dump : \n{rados_obj.run_ceph_command(cmd='ceph mon dump')}\n"
+                f"ceph report: \n {rados_obj.run_ceph_command(cmd='ceph report')}\n"
+                f"osd tree : \n{rados_obj.run_ceph_command(cmd='ceph osd df tree')}\n"
+            )
             raise Exception(
                 "Stretch mode degraded test Failed on the provided cluster"
             )
@@ -269,8 +283,13 @@ def run(ceph_cluster, **kw):
             log.error(
                 "Write ops should be possible, number of objects in the pool has not changed"
             )
-            log.debug(
-                "Test case expected to fail until bug fix : https://bugzilla.redhat.com/show_bug.cgi?id=2265116"
+            log.info(
+                f"cluster output dumps:\n"
+                f"ceph status : \n{rados_obj.run_ceph_command(cmd='ceph -s')}\n"
+                f"health detail : \n{rados_obj.run_ceph_command(cmd='ceph health detail')}\n"
+                f"mon dump : \n{rados_obj.run_ceph_command(cmd='ceph mon dump')}\n"
+                f"ceph report: \n {rados_obj.run_ceph_command(cmd='ceph report')}\n"
+                f"osd tree : \n{rados_obj.run_ceph_command(cmd='ceph osd df tree')}\n"
             )
             raise Exception(
                 f"Pool {pool_name} has {pool_stat['stats']['objects']} objs"
@@ -286,7 +305,7 @@ def run(ceph_cluster, **kw):
         for host in cluster_nodes:
             log.debug(f"Proceeding to flush iptable rules on host : {host.hostname}")
             host.exec_command(sudo=True, cmd="iptables -F", long_running=True)
-            host.exec_command(sudo=True, cmd="reboot")
+            host.exec_command(sudo=True, cmd="reboot", check_ec=False)
             time.sleep(20)
 
         log.debug("Sleeping for 30 seconds...")
@@ -304,9 +323,6 @@ def run(ceph_cluster, **kw):
 
     except Exception as err:
         log.error(f"Hit an exception: {err}. Test failed")
-        log.debug(
-            "Test case expected to fail until bug fix : https://bugzilla.redhat.com/show_bug.cgi?id=2265116"
-        )
         return 1
     finally:
         log.debug("---------------- In Finally Block -------------")
@@ -314,14 +330,21 @@ def run(ceph_cluster, **kw):
         for host in cluster_nodes:
             log.debug(f"Proceeding to flush iptable rules on host : {host.hostname}")
             host.exec_command(sudo=True, cmd="iptables -F", long_running=True)
-            host.exec_command(sudo=True, cmd="reboot")
+            host.exec_command(sudo=True, cmd="reboot", check_ec=False)
             time.sleep(20)
 
         if config.get("delete_pool"):
             rados_obj.delete_pool(pool=pool_name)
 
+        if set_debug:
+            log.debug("Removing debug configs on the cluster for mon, osd & Mgr")
+            mon_obj.remove_config(section="osd", name="debug_osd")
+            mon_obj.remove_config(section="mon", name="debug_mon")
+            mon_obj.remove_config(section="mgr", name="debug_mgr")
+
         init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
         log.debug(f"time when test was Ended : {init_time}")
+        time.sleep(60)
 
         # log cluster health
         rados_obj.log_cluster_health()
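Closing note on the netsplit test changes: the reboot after iptables -F now passes check_ec=False, since rebooting tears down the SSH session and the exit status is not meaningful, and the optional set_debug flag raises and later removes mon/osd/mgr debug levels through MonConfigMethods. The helper below is only an illustration of the same flush-and-reboot teardown plus one possible way to introduce the netsplit with DROP rules; the rule set-up is an assumption (the real logic lives inside test_stretch_netsplit_scenarios.py and is not shown in this diff), and ip_address is assumed to be available on the cephci node objects.

# Illustrative only: simulate a netsplit between two sites with iptables DROP
# rules and undo it with the same flush + reboot pattern used in the diff.
import time


def block_site_traffic(site_a_hosts, site_b_hosts):
    # Assumption: each node object exposes exec_command() and ip_address.
    for src in site_a_hosts:
        for dst in site_b_hosts:
            # Drop both inbound and outbound traffic for the peer site.
            src.exec_command(
                sudo=True, cmd=f"iptables -A INPUT -s {dst.ip_address} -j DROP"
            )
            src.exec_command(
                sudo=True, cmd=f"iptables -A OUTPUT -d {dst.ip_address} -j DROP"
            )


def restore_traffic(cluster_nodes):
    # Mirrors the teardown in the finally block: flush rules, then reboot
    # without checking the exit code, since the reboot drops the SSH session.
    for host in cluster_nodes:
        host.exec_command(sudo=True, cmd="iptables -F", long_running=True)
        host.exec_command(sudo=True, cmd="reboot", check_ec=False)
        time.sleep(20)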