stretch mode: Fixing TFA issues identified in stretch mode execution
Signed-off-by: Pawan Dhiran <[email protected]>
pdhiran committed Jan 13, 2025
1 parent aa2b0d5 commit aa7b6cb
Showing 8 changed files with 150 additions and 77 deletions.
23 changes: 12 additions & 11 deletions suites/pacific/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -319,17 +319,6 @@ tests:
delete_pool: true
desc: Test stretch Cluster netsplit scenario between data sites

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement

- test:
name: Netsplit Scenarios data-tiebreaker sites
module: test_stretch_netsplit_scenarios.py
@@ -360,3 +349,15 @@ tests:
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
desc: Perform post-deployment negative tests on stretch mode
comments: -ve scenarios bug - 2293147

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement
24 changes: 13 additions & 11 deletions suites/quincy/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -319,17 +319,6 @@ tests:
delete_pool: true
desc: Test stretch Cluster netsplit scenario between data sites

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement

- test:
name: Netsplit Scenarios data-tiebreaker sites
module: test_stretch_netsplit_scenarios.py
Expand All @@ -352,6 +341,7 @@ tests:
delete_pool: true
desc: Test the cluster when the Data site is moved to maintenance mode


- test:
name: Negative scenarios - post-deployment
module: test_stretch_negative_scenarios.py
Expand All @@ -360,3 +350,15 @@ tests:
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
desc: Perform post-deployment negative tests on stretch mode
comments: -ve scenarios bug - 2293147

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement
23 changes: 12 additions & 11 deletions suites/reef/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -333,17 +333,6 @@ tests:
desc: Test stretch Cluster netsplit scenario between data sites
comments: Active bug - 2318975

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement

- test:
name: Netsplit Scenarios data-tiebreaker sites
module: test_stretch_netsplit_scenarios.py
@@ -374,3 +363,15 @@ tests:
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
desc: Perform post-deployment negative tests on stretch mode
comments: -ve scenarios bug - 2293147

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement
32 changes: 16 additions & 16 deletions suites/squid/rados/tier-2_rados_test-brownfield.yaml
@@ -146,14 +146,14 @@ tests:
polarion-id: CEPH-83574439
abort-on-fail: false

# commented until fix merged in Squid: #2326892
# - test:
# name: "issue repro-obj snap and pool snap deletion"
# module: test_pool_snap.py
# desc: obj snap deletion when pool snapshot is deleted on unfixed build
# polarion-id: CEPH-83602685
# config:
# issue_reproduction: true
# Fix for #2326892 merged in Squid - test re-enabled
- test:
name: "issue repro-obj snap and pool snap deletion"
module: test_pool_snap.py
desc: obj snap deletion when pool snapshot is deleted on unfixed build
polarion-id: CEPH-83602685
config:
issue_reproduction: true

- test:
name: Upgrade cluster to latest 8.x ceph version
Expand All @@ -169,14 +169,14 @@ tests:
destroy-cluster: false
abort-on-fail: true

# commented until fix merged in Squid: #2326892
# - test:
# name: "verify fix-obj snap and pool snap deletion"
# module: test_pool_snap.py
# desc: obj snap deletion when pool snapshot is deleted on fixed build
# polarion-id: CEPH-83602685
# config:
# verify_fix: true
# Fix for #2326892 merged in Squid - test re-enabled
- test:
name: "verify fix-obj snap and pool snap deletion"
module: test_pool_snap.py
desc: obj snap deletion when pool snapshot is deleted on fixed build
polarion-id: CEPH-83602685
config:
verify_fix: true
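
The two re-enabled entries above drive the same module twice around the upgrade: once on the unfixed build with issue_reproduction: true and once on the upgraded build with verify_fix: true. A rough sketch of how such a flag split could look inside a run() entry point is shown below; the branching and the helper names reproduce_issue and verify_fix are illustrative assumptions, not the actual test_pool_snap.py implementation, and log is assumed to be the module logger as in the other tests.

def run(ceph_cluster, **kw):
    """Dispatch the pool/object snapshot check for the pre- or post-upgrade phase."""
    config = kw.get("config", {})

    if config.get("issue_reproduction"):
        # Pre-upgrade phase: exercise the scenario on the unfixed build so the
        # buggy behaviour is recorded before the cluster is upgraded.
        return reproduce_issue(ceph_cluster)
    if config.get("verify_fix"):
        # Post-upgrade phase: repeat the scenario and confirm the fixed behaviour.
        return verify_fix(ceph_cluster)

    log.error("config must set either issue_reproduction or verify_fix")
    return 1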

# Running basic rbd and rgw tests after upgrade
- test:
24 changes: 12 additions & 12 deletions suites/squid/rados/tier-3_rados_test-location-stretch-mode.yaml
@@ -4,7 +4,6 @@
# This test case is Openstack only and cannot be run in Baremetal env due to test constraints.
# Stretch mode deployment in BM is run by suite : suites/squid/rados/deploy-stretch-cluster-mode.yaml

# RHOS-d run duration: 300 mins
tests:
- test:
name: Install ceph pre-requisites
@@ -332,17 +331,6 @@ tests:
desc: Test stretch Cluster netsplit scenario between data sites
comments: Active bug - 2318975

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement

- test:
name: Netsplit Scenarios data-tiebreaker sites
module: test_stretch_netsplit_scenarios.py
@@ -373,3 +361,15 @@ tests:
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
desc: Perform post-deployment negative tests on stretch mode
comments: -ve scenarios bug - 2293147

- test:
name: OSD and host replacement
module: test_stretch_osd_serviceability_scenarios.py
polarion-id: CEPH-83575474
config:
pool_name: test_stretch_pool7
stretch_bucket: datacenter
tiebreaker_mon_site_name: tiebreaker
delete_pool: true
desc: Test stretch Cluster osd and Host replacement
51 changes: 48 additions & 3 deletions tests/rados/stretch_cluster.py
@@ -75,13 +75,20 @@ def run(ceph_cluster, **kw):
log.info("Verifying forced recovery and healthy in stretch environment")

pool_name = "stretch_pool_recovery"
if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
if not rados_obj.create_pool(pool_name=pool_name):
log.error("Failed to create the replicated Pool")
return 1

# getting the acting set for the created pool
acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool_name)

# Get the initial object count, to verify later that writes succeeded post recovery
pool_stat_init = rados_obj.get_cephdf_stats(pool_name=pool_name)
init_objects = pool_stat_init["stats"]["objects"]
log.debug(
f"initial number of objects on the pool : {pool_name} is {init_objects}"
)

log.info(
f"Killing 2 OSD's from acting set : {acting_pg_set} to verify recovery"
)
Expand All @@ -91,20 +98,56 @@ def run(ceph_cluster, **kw):
log.error(f"Unable to stop the OSD : {osd_id}")
return 1

# Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
time.sleep(25)
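# Apply recovery thread tuning for the test run; the matching action="rm" call at the end of the test reverts it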
rados_obj.change_recovery_threads(config={}, action="set")

# Guide link : https://docs.ceph.com/en/reef/rados/configuration/mon-osd-interaction/#monitor-settings
log.info(
"Updating the behaviour of the test case. It is expected that PGs will enter peered state when there "
"is a partial site failure. The bugzilla below has more details on the behaviour. "
"Bugzilla : https://bugzilla.redhat.com/show_bug.cgi?id=2328649 "
)

log.info(
"'osd_heartbeat_grace': 20 sec & 'mon_osd_down_out_interval': 10 min. "
"Sleeping for 11 minutes for OSDs to be marked down & out of the cluster, so that recovery starts"
)
time.sleep(60 * 11)

log.info("Stopped 2 OSD's from acting set, starting to wait for recovery")

if not rados_obj.bench_write(pool_name=pool_name, **config):
log.error("Failed to write objects into the Pool")
return 1

pool_stat_intrim = rados_obj.get_cephdf_stats(pool_name=pool_name)
intrim_objects = pool_stat_intrim["stats"]["objects"]
log.debug(
f"number of objects on the pool with OSD down & out: {pool_name} is {intrim_objects}"
)

log.debug("Triggering forced recovery in stretch mode")
cmd = "ceph osd force_recovery_stretch_mode --yes-i-really-mean-it"
rados_obj.run_ceph_command(cmd)
log.info("Triggered the recovery in stretch mode")

# Object count should be greater than the initial number of objects
if int(intrim_objects) <= int(init_objects):
log.error(
"Write ops should be possible, number of objects in the pool has not changed"
)
log.info(
f"cluster output dumps:\n"
f"ceph status : \n{rados_obj.run_ceph_command(cmd='ceph -s')}\n"
f"health detail : \n{rados_obj.run_ceph_command(cmd='ceph health detail')}\n"
f"mon dump : \n{rados_obj.run_ceph_command(cmd='ceph mon dump')}\n"
f"ceph report: \n {rados_obj.run_ceph_command(cmd='ceph report')}\n"
f"osd tree : \n{rados_obj.run_ceph_command(cmd='ceph osd df tree')}\n"
)
raise Exception(f"Pool {pool_name} has {intrim_objects} objs")
log.info(
f"Successfully wrote {int(intrim_objects) - int(init_objects)} objects on pool {pool_name} in degraded mode\n"
)

log.debug("Starting the stopped OSD's")
for osd_id in stop_osds:
if not rados_obj.change_osd_state(action="restart", target=osd_id):
Expand All @@ -123,6 +166,8 @@ def run(ceph_cluster, **kw):
rados_obj.run_ceph_command(cmd)
rados_obj.delete_pool(pool=pool_name)

# Revert the recovery thread tuning applied at the start of the test
rados_obj.change_recovery_threads(config={}, action="rm")

log.info("Cluster has successfully recovered and is in healthy state")
return 0
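
The eleven-minute sleep added above simply waits out 'osd_heartbeat_grace' plus 'mon_osd_down_out_interval' before checking that writes still land on the pool. A minimal polling alternative is sketched below; it assumes, as the surrounding test does, that rados_obj.run_ceph_command() returns the parsed JSON of the given ceph command and that the module-level log object is available. The helper name wait_for_osds_down_and_out is hypothetical and not part of the framework.

import time


def wait_for_osds_down_and_out(rados_obj, osd_ids, timeout=900, interval=30):
    """Poll 'ceph osd dump' until every OSD in osd_ids is reported down and out."""
    osd_ids = {int(osd) for osd in osd_ids}
    end_time = time.time() + timeout
    while time.time() < end_time:
        osd_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
        # "up" and "in" are 0/1 flags on each entry of the "osds" array
        pending = [
            entry["osd"]
            for entry in osd_dump["osds"]
            if entry["osd"] in osd_ids and (entry["up"] or entry["in"])
        ]
        if not pending:
            log.info(f"OSDs {osd_ids} are marked down and out, recovery can start")
            return True
        log.debug(f"Still waiting for OSDs to be marked down & out : {pending}")
        time.sleep(interval)
    log.error(f"OSDs {osd_ids} were not marked down & out within {timeout} seconds")
    return False

Called with stop_osds right after the OSDs are stopped, a check like this would bound the wait the same way while returning as soon as the monitors mark the OSDs out.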

1 change: 1 addition & 0 deletions tests/rados/test_pool_snap.py
@@ -21,6 +21,7 @@ def run(ceph_cluster, **kw):
- Pacific: BZ-2272361
- Quincy: BZ-2272362
- Reef: BZ-2263169
- Squid: BZ-2326892
Test to verify object snap deletion when parent pool snapshot is deleted.
1. Create a replicated pool with default config
2. Use rados put to write a single object to the pool