From 73b4a711a606bdd75fde1ef0a3073169d7824673 Mon Sep 17 00:00:00 2001
From: Sunil Angadi
Date: Mon, 13 Jan 2025 18:57:57 +0530
Subject: [PATCH] migration across ceph clusters

Signed-off-by: Sunil Angadi
---
 ceph/rbd/utils.py                             |  46 ++-
 ceph/rbd/workflows/migration.py               |  38 ++-
 cli/rbd/migration.py                          |  24 +-
 .../tier-2_rbd_migration_external_ceph.yaml   | 169 ++++++++++
 ...est_rbd_migration_external_native_image.py | 311 ++++++++++++++++++
 5 files changed, 583 insertions(+), 5 deletions(-)
 create mode 100644 suites/squid/rbd/tier-2_rbd_migration_external_ceph.yaml
 create mode 100644 tests/rbd/test_rbd_migration_external_native_image.py

diff --git a/ceph/rbd/utils.py b/ceph/rbd/utils.py
index 191d30655..798796b33 100644
--- a/ceph/rbd/utils.py
+++ b/ceph/rbd/utils.py
@@ -35,16 +35,19 @@ def value(key, dictionary):
     return str(list(find(key, dictionary))[0])
 
 
-def copy_file(file_name, src, dest):
+def copy_file(file_name, src, dest, dest_file_name=None):
     """Copies the given file from src node to dest node
 
     Args:
         file_name: Full path of the file to be copied
         src: Source CephNode object
         dest: Destination CephNode object
+        dest_file_name: Destination file path; defaults to file_name when not given
     """
     contents, err = src.exec_command(sudo=True, cmd="cat {}".format(file_name))
-    key_file = dest.remote_file(sudo=True, file_name=file_name, file_mode="w")
+    key_file = dest.remote_file(
+        sudo=True, file_name=dest_file_name or file_name, file_mode="w"
+    )
     key_file.write(contents)
     key_file.flush()
 
@@ -201,3 +204,42 @@ def convert_size(size_bytes):
     p = math.pow(1024, i)
     s = round(size_bytes / p)
     return "%s%s" % (s, size_name[i])
+
+
+def configure_common_client_node(client1, client2):
+    """
+    Configure the common client node as client1 to access both clusters.
+    Args:
+        client1: Cluster1 client node object
+        client2: Cluster2 client node object
+    """
+
+    # Ensure /etc/ceph directory exists and is writable on client1
+    client1.exec_command(cmd="sudo mkdir -p /etc/ceph && sudo chmod 777 /etc/ceph")
+
+    # Copy cluster2 configuration and keyring files to client1
+    cluster2_files = [
+        ("/etc/ceph/ceph.conf", "/etc/ceph/cluster2.conf"),
+        (
+            "/etc/ceph/ceph.client.admin.keyring",
+            "/etc/ceph/cluster2.client.admin.keyring",
+        ),
+    ]
+    for file, dest_path in cluster2_files:
+        copy_file(file_name=file, src=client2, dest=client1, dest_file_name=dest_path)
+
+    client1.exec_command(sudo=True, cmd="chmod 644 /etc/ceph/*")
+
+    # Verify cluster accessibility for both clusters
+    for cluster_name in ["ceph", "cluster2"]:
+        out, err = client1.exec_command(
+            cmd=f"ceph -s --cluster {cluster_name}", output=True
+        )
+        log.info(f"Cluster {cluster_name} status: {out}")
+        if err:
+            log.error(
+                f"Unable to access cluster {cluster_name} from common client node"
+            )
+            return 1
+    log.info("Common client node configured successfully.")
+    return 0
diff --git a/ceph/rbd/workflows/migration.py b/ceph/rbd/workflows/migration.py
index fc1fb4216..c2c388e80 100644
--- a/ceph/rbd/workflows/migration.py
+++ b/ceph/rbd/workflows/migration.py
@@ -1,4 +1,5 @@
 import json
+import tempfile
 
 from cli.rbd.rbd import Rbd
 from utility.log import Log
@@ -6,7 +7,7 @@
 log = Log(__name__)
 
 
-def verify_migration_state(action, image_spec, **kw):
+def verify_migration_state(action, image_spec, cluster_name="ceph", **kw):
     """verify the migration status at each action.
 
     This method will verify the migration state for an image for
@@ -24,7 +25,11 @@ def verify_migration_state(action, image_spec, **kw):
     """
     rbd = Rbd(kw["client"])
     log.info("verifying migration state")
-    status_config = {"image-spec": image_spec, "format": "json"}
+    status_config = {
+        "image-spec": image_spec,
+        "cluster": cluster_name,
+        "format": "json",
+    }
     out, err = rbd.status(**status_config)
     log.info(out)
     status = json.loads(out)
@@ -39,3 +44,32 @@ def verify_migration_state(action, image_spec, **kw):
     except Exception as error:
         log.error(error)
         return 1
+
+
+def prepare_migration_source_spec(
+    cluster_name, client, pool_name, image_name, snap_name
+):
+    """
+    Create a native source spec file for migration.
+    Args:
+        cluster_name: Name of the source cluster
+        pool_name: Name of the source pool
+        image_name: Name of the source image
+        snap_name: Name of the snapshot
+    Returns:
+        Path to the native spec file
+    """
+    native_spec = {
+        "cluster_name": cluster_name,
+        "type": "native",
+        "pool_name": pool_name,
+        "image_name": image_name,
+        "snap_name": snap_name,
+    }
+
+    temp_file = tempfile.NamedTemporaryFile(dir="/tmp", suffix=".json")
+    spec_file = client.remote_file(sudo=True, file_name=temp_file.name, file_mode="w")
+    spec_file.write(json.dumps(native_spec, indent=4))
+    spec_file.flush()
+
+    return temp_file.name
diff --git a/cli/rbd/migration.py b/cli/rbd/migration.py
index 28ac6aa06..273921b90 100644
--- a/cli/rbd/migration.py
+++ b/cli/rbd/migration.py
@@ -44,6 +44,28 @@ def action(self, **kw):
         """
         action = kw.get("action", None)
         dest_spec = kw.get("dest_spec", None)
+        cluster_name = kw.get("cluster_name", "ceph")
         log.info(f"Starting the {action} migration process")
-        cmd = f"{self.base_cmd} {action} {dest_spec}"
+        cmd = f"{self.base_cmd} {action} {dest_spec} --cluster {cluster_name}"
+        return self.execute_as_sudo(cmd=cmd, long_running=True)
+
+    def prepare_import(self, **kw):
+        """
+        Prepare the live migration of an image from one Ceph cluster to another.
+        Args:
+            kw(dict): Key/value pairs that need to be provided to the client node.
+            Example::
+            Supported keys:
+                source_spec_path : path to the JSON-formatted source spec file for streamed imports
+                dest_spec: Target image spec TARGET_POOL_NAME/TARGET_IMAGE_NAME
+
+        """
+        log.info("Starting prepare Live migration of image to external ceph cluster")
+        source_spec_path = kw.get("source_spec_path", None)
+        dest_spec = kw.get("dest_spec", None)
+        cluster_name = kw.get("cluster_name", None)
+        cmd = (
+            f"{self.base_cmd} prepare --import-only --source-spec-path {source_spec_path} "
+            f"{dest_spec} --cluster {cluster_name}"
+        )
         return self.execute_as_sudo(cmd=cmd, long_running=True)
diff --git a/suites/squid/rbd/tier-2_rbd_migration_external_ceph.yaml b/suites/squid/rbd/tier-2_rbd_migration_external_ceph.yaml
new file mode 100644
index 000000000..d77e749fe
--- /dev/null
+++ b/suites/squid/rbd/tier-2_rbd_migration_external_ceph.yaml
@@ -0,0 +1,169 @@
+#===============================================================================================
+# Tier-level: 2
+# Test-Suite: tier-2_rbd_migration_external_ceph.yaml
+#
+# Cluster Configuration:
+#    cephci/conf/squid/rbd/5-node-2-clusters.yaml
+#    No of Clusters : 2
+#    Each cluster configuration
+#    5-Node cluster (RHEL-8.3 and above)
+#    3 MONS, 2 MGR, 3 OSD, 1 Client
+#     Node1 - Mon, Mgr, Installer
+#     Node2 - client
+#     Node3 - OSD, MON, MGR
+#     Node4 - OSD, MON
+#     Node5 - OSD
+#===============================================================================================
+tests:
+  - test:
+      name: setup install pre-requisites
+      desc: Setup phase to deploy the required pre-requisites for running the tests.
+      module: install_prereq.py
+      abort-on-fail: true
+
+  - test:
+      abort-on-fail: true
+      clusters:
+        ceph-rbd1:
+          config:
+            verify_cluster_health: true
+            steps:
+              - config:
+                  command: bootstrap
+                  service: cephadm
+                  args:
+                    mon-ip: node1
+                    orphan-initial-daemons: true
+                    skip-monitoring-stack: true
+              - config:
+                  command: add_hosts
+                  service: host
+                  args:
+                    attach_ip_address: true
+                    labels: apply-all-labels
+              - config:
+                  command: apply
+                  service: mgr
+                  args:
+                    placement:
+                      label: mgr
+              - config:
+                  command: apply
+                  service: mon
+                  args:
+                    placement:
+                      label: mon
+              - config:
+                  command: apply
+                  service: osd
+                  args:
+                    all-available-devices: true
+        ceph-rbd2:
+          config:
+            verify_cluster_health: true
+            steps:
+              - config:
+                  command: bootstrap
+                  service: cephadm
+                  args:
+                    mon-ip: node1
+                    orphan-initial-daemons: true
+                    skip-monitoring-stack: true
+              - config:
+                  command: add_hosts
+                  service: host
+                  args:
+                    attach_ip_address: true
+                    labels: apply-all-labels
+              - config:
+                  command: apply
+                  service: mgr
+                  args:
+                    placement:
+                      label: mgr
+              - config:
+                  command: apply
+                  service: mon
+                  args:
+                    placement:
+                      label: mon
+              - config:
+                  command: apply
+                  service: osd
+                  args:
+                    all-available-devices: true
+      desc: Two ceph cluster deployment for external ceph migration testing
+      destroy-cluster: false
+      module: test_cephadm.py
+      name: deploy two ceph clusters
+
+  - test:
+      abort-on-fail: true
+      clusters:
+        ceph-rbd1:
+          config:
+            command: add
+            id: client.1
+            node: node2
+            install_packages:
+              - ceph-common
+              - fio
+            copy_admin_keyring: true
+        ceph-rbd2:
+          config:
+            command: add
+            id: client.1
+            node: node2
+            install_packages:
+              - ceph-common
+              - fio
+            copy_admin_keyring: true
+      desc: Configure the client node for both the clusters
+      destroy-cluster: false
+      module: test_client.py
+      name: configure client
+
+  - test:
+      desc: Enable mon_allow_pool_delete to True for deleting the pools
+      module: exec.py
+      name: configure mon_allow_pool_delete to True
+      abort-on-fail: true
+      config:
+        cephadm: true
+        commands:
+          - "ceph config set mon mon_allow_pool_delete true"
+
+  - test:
+      desc: Install rbd-nbd and remove any epel packages
+      module: exec.py
+      name: Install rbd-nbd
+      config:
+        sudo: true
+        commands:
+          - "rm -rf /etc/yum.repos.d/epel*"
+          - "dnf install rbd-nbd -y"
+
+  - test:
+      name: Test image migration with external ceph cluster
+      desc: live migration with external ceph native data format
+      module: test_rbd_migration_external_native_image.py
+      clusters:
+        ceph-rbd1:
+          config:
+            rep_pool_config:
+              num_pools: 1
+              num_images: 1
+              size: 4G
+              create_pool_parallely: true
+              create_image_parallely: true
+              test_ops_parallely: true
+              io_size: 1G
+            ec_pool_config:
+              num_pools: 1
+              num_images: 1
+              size: 4G
+              create_pool_parallely: true
+              create_image_parallely: true
+              test_ops_parallely: true
+              io_size: 1G
+      polarion-id: CEPH-83597689
diff --git a/tests/rbd/test_rbd_migration_external_native_image.py b/tests/rbd/test_rbd_migration_external_native_image.py
new file mode 100644
index 000000000..8139f1a4c
--- /dev/null
+++ b/tests/rbd/test_rbd_migration_external_native_image.py
@@ -0,0 +1,311 @@
+"""Module to verify successful live migration of RBD images from
+one ceph cluster to another ceph cluster with native data format.
+
+Test case covered -
+CEPH-83597689 - Live migration of Native RBD images
+from one ceph cluster to another ceph cluster
+
+Pre-requisites:
+- Two Ceph clusters deployed and accessible.
+- A common client node configured to access both clusters.
+- `ceph-common` package installed with live migration binaries available.
+
+Test Case Flow:
+1. Deploy two ceph clusters as source (cluster1) and destination (cluster2)
+along with mon, mgr and osd daemons
+2. Install the ceph-common package on one common client node
+3. Create one common client node for both clusters by copying both
+the ceph.conf and ceph.client.admin.keyring from both clusters
+into the common client node.
+4. Create one Replicated pool on both clusters and initialize it.
+5. Create one RBD image inside that pool
+6. Write some data to the image using rbd bench or fio or a file mount
+and note down its md5sum for the data consistency check
+7. Create one snapshot of that image using the rbd snap create command
+8. Create a native source spec file for migration
+9. Execute migration prepare with the import-only option and
+--source-spec-path for the RBD image migration
+10. Initiate migration execution using the migration execute command
+11. Commit the migration using the migration commit command
+12. Export the migrated image using the rbd export command and
+note down its md5sum checksum
+13. Verify that the md5sum checksums before and after
+migration are the same
+14. Repeat the above test for an EC pool
+15. Unmount, unmap and clean up the pools and images
+"""
+
+from copy import deepcopy
+
+from ceph.rbd.initial_config import initial_rbd_config
+from ceph.rbd.utils import (
+    configure_common_client_node,
+    get_md5sum_rbd_image,
+    getdict,
+    random_string,
+)
+from ceph.rbd.workflows.cleanup import cleanup
+from ceph.rbd.workflows.migration import (
+    prepare_migration_source_spec,
+    verify_migration_state,
+)
+from ceph.rbd.workflows.rbd import (
+    create_single_pool_and_images,
+    run_io_and_check_rbd_status,
+)
+from cli.rbd.rbd import Rbd
+from utility.log import Log
+
+log = Log(__name__)
+
+
+def test_external_rbd_image_migration(rbd_obj, c1_client, c2_client, **kw):
+    """
+    Test to perform live migration of images with native data format.
+
+    Args:
+        rbd_obj: rbd object
+        c1_client: Cluster1 client node object
+        c2_client: Cluster2 client node object
+        kw: Key/value pairs of configuration information to be used in the test
+    """
+    c1 = "ceph"
+    c2 = "cluster2"
+    snap_name = "snap1"
+    rbd2 = Rbd(c2_client)
+
+    for pool_type in rbd_obj.get("pool_types"):
+        rbd_config = kw.get("config", {}).get(pool_type, {})
+        multi_pool_config = deepcopy(getdict(rbd_config))
+        rbd = rbd_obj.get("rbd")
+        for pool, pool_config in multi_pool_config.items():
+            if "data_pool" in pool_config.keys():
+                _ = pool_config.pop("data_pool")
+            multi_image_config = getdict(pool_config)
+            for image_name, image_conf in multi_image_config.items():
+                # Run IO on the image
+                try:
+                    io_rc = run_io_and_check_rbd_status(
+                        rbd=rbd,
+                        pool=pool,
+                        image=image_name,
+                        client=c1_client,
+                        image_conf=image_conf,
+                    )
+                    if io_rc:
+                        log.error(f"IO on image {image_name} failed")
+                        return 1
+
+                    # create snapshot for the image
+                    rbd.snap.create(
+                        pool=pool,
+                        image=image_name,
+                        snap=snap_name,
+                    )
+
+                    # get md5sum of image before migration for data consistency check
+                    md5_sum_before_migration = get_md5sum_rbd_image(
+                        image_spec=f"{pool}/{image_name}",
+                        rbd=rbd,
+                        client=c1_client,
+                        file_path=f"/tmp/{random_string(len=3)}",
+                    )
+                    log.info(f"md5sum before Migration: {md5_sum_before_migration}")
+
+                    # prepare migration source spec
+                    source_spec_path = prepare_migration_source_spec(
+                        cluster_name=c1,
+                        client=c1_client,
+                        pool_name=pool,
+                        image_name=image_name,
+                        snap_name=snap_name,
+                    )
+
+                    # Create a target pool where image needs to be migrated on cluster2
+                    is_ec_pool = True if "ec" in pool_type else False
+                    config = kw.get("config", {})
+                    target_pool = "target_pool_" + random_string(len=5)
+                    target_pool_config = {}
+                    if is_ec_pool:
+                        data_pool_target = "data_pool_new_" + random_string(len=5)
+                        target_pool_config["data_pool"] = data_pool_target
+
+                    rc = create_single_pool_and_images(
+                        config=config,
+                        pool=target_pool,
+                        pool_config=target_pool_config,
+                        client=c2_client,
+                        cluster="ceph",
+                        rbd=rbd2,
+                        ceph_version=int(config.get("rhbuild")[0]),
+                        is_ec_pool=is_ec_pool,
+                        is_secondary=False,
+                        do_not_create_image=True,
+                    )
+                    if rc:
+                        log.error(f"Creation of target pool {target_pool} failed")
+                        return rc
+
+                    # Execute prepare migration for external cluster
+                    target_image = "target_image_" + random_string(len=5)
+                    rbd.migration.prepare_import(
+                        source_spec_path=source_spec_path,
+                        dest_spec=f"{target_pool}/{target_image}",
+                        cluster_name=c2,
+                    )
+
+                    # verify prepare migration status
+                    if verify_migration_state(
+                        action="prepare",
+                        image_spec=f"{target_pool}/{target_image}",
+                        cluster_name=c2,
+                        client=c1_client,
+                        **kw,
+                    ):
+                        log.error("Failed to prepare migration")
+                        return 1
+
+                    # execute migration from cluster2
+                    rbd.migration.action(
+                        action="execute",
+                        dest_spec=f"{target_pool}/{target_image}",
+                        cluster_name=c2,
+                    )
+
+                    # verify execute migration status
+                    if verify_migration_state(
+                        action="execute",
+                        image_spec=f"{target_pool}/{target_image}",
+                        cluster_name=c2,
+                        client=c1_client,
+                        **kw,
+                    ):
+                        log.error("Failed to execute migration")
+                        return 1
+
+                    # commit migration for external cluster
+                    rbd.migration.action(
+                        action="commit",
+                        dest_spec=f"{target_pool}/{target_image}",
+                        cluster_name=c2,
+                    )
+
+                    # verify commit migration status
+                    if verify_migration_state(
+                        action="commit",
+                        image_spec=f"{target_pool}/{target_image}",
+                        cluster_name=c2,
+                        client=c1_client,
+                        **kw,
+                    ):
+                        log.error("Failed to commit migration")
+                        return 1
+
+                    # verify checksum post migration
+                    md5_sum_after_migration = get_md5sum_rbd_image(
+                        image_spec=f"{target_pool}/{target_image}",
+                        rbd=rbd2,
+                        client=c2_client,
+                        file_path=f"/tmp/{random_string(len=5)}",
+                    )
+                    log.info(f"md5sum after migration: {md5_sum_after_migration}")
+
+                    if md5_sum_before_migration != md5_sum_after_migration:
+                        log.error(
+                            "Data integrity check failed, md5sum checksums are not the same"
+                        )
+                        return 1
+                    log.info("md5sum checksum is the same on both clusters after migration")
+
+                except Exception as e:
+                    log.error(f"Error during migration: {str(e)}")
+                    return 1
+
+                finally:
+                    if source_spec_path:
+                        log.info(f"Cleaning up source spec path: {source_spec_path}")
+                        out, err = c1_client.exec_command(
+                            sudo=True, cmd=f"rm -f {source_spec_path}"
+                        )
+                        if err:
+                            log.error(f"Failed to delete file {source_spec_path}")
+                    if target_pool:
+                        log.info(f"Removing target pool: {target_pool}")
+                        out, err = c2_client.exec_command(
+                            sudo=True,
+                            cmd=f"ceph osd pool delete {target_pool} {target_pool} --yes-i-really-really-mean-it",
+                        )
+                        if err:
+                            log.error(f"Failed to delete the pool {target_pool}")
+    return 0
+
+
+def run(**kw):
+    """
+    Test to execute live image migration with native data format
+    from an external ceph cluster.
+    Args:
+        kw: Key/value pairs of configuration information to be used in the test
+        Example::
+            config:
+                do_not_create_image: True
+                rep_pool_config:
+                    num_pools: 2
+                    size: 4G
+                ec_pool_config:
+                    num_pools: 2
+                    size: 4G
+                create_pool_parallely: true
+    """
+    log.info(
+        "Executing CEPH-83597689: Live migration of images with native \
+        data format from external ceph cluster"
+    )
+
+    try:
+        rbd_obj = initial_rbd_config(**kw)
+        pool_types = rbd_obj.get("pool_types")
+
+        cluster1_client = (
+            kw.get("ceph_cluster_dict").get("ceph-rbd1").get_nodes(role="client")[0]
+        )
+
+        cluster2_client = (
+            kw.get("ceph_cluster_dict").get("ceph-rbd2").get_nodes(role="client")[0]
+        )
+
+        if configure_common_client_node(
+            client1=cluster1_client,
+            client2=cluster2_client,
+        ):
+            log.error("Common client node configuration failed")
+            return 1
+
+        if rbd_obj:
+            log.info("Executing test on Replicated and EC pool")
+
+            ret_val = test_external_rbd_image_migration(
+                rbd_obj=rbd_obj,
+                c1_client=cluster1_client,
+                c2_client=cluster2_client,
+                **kw,
+            )
+
+            if ret_val == 0:
+                log.info(
+                    "Testing RBD image migration with external ceph native format Passed"
+                )
+
+    except Exception as e:
+        log.error(
+            f"RBD image migration with external ceph native format failed with the error {str(e)}"
+        )
+        ret_val = 1
+
+    finally:
+        cluster_name = kw.get("ceph_cluster", {}).name
+        if "rbd_obj" not in locals():
+            rbd_obj = Rbd(cluster1_client)
+        obj = {cluster_name: rbd_obj}
+        cleanup(pool_types=pool_types, multi_cluster_obj=obj, **kw)
+
+    return ret_val
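
Reviewer note (illustrative only, not part of the patch): the sketch below shows roughly what the helpers introduced here end up doing on the common client node, assuming the CLI wrapper's base command is "rbd migration". All pool, image and path names are placeholders; the real values are generated at runtime by the test, the spec file is written by prepare_migration_source_spec(), and the commands are assembled by the prepare_import()/action() wrappers in cli/rbd/migration.py.

    import json

    # Placeholder values for illustration only.
    src_pool, src_image, snap = "rep_pool_1", "image_1", "snap1"
    dst_pool, dst_image = "target_pool_xyz", "target_image_xyz"
    spec_path = "/tmp/native_spec.json"

    # 1. Native source spec, as written by prepare_migration_source_spec() on the
    #    common client. "ceph" is the source cluster and resolves through the
    #    local /etc/ceph/ceph.conf on that node.
    native_spec = {
        "cluster_name": "ceph",
        "type": "native",
        "pool_name": src_pool,
        "image_name": src_image,
        "snap_name": snap,
    }
    print(json.dumps(native_spec, indent=4))

    # 2. Commands the CLI wrappers build. --cluster cluster2 picks up the
    #    /etc/ceph/cluster2.conf and cluster2.client.admin.keyring copied in by
    #    configure_common_client_node(), so the destination cluster is addressed
    #    directly while the source image is described entirely by the spec file.
    for cmd in (
        f"rbd migration prepare --import-only --source-spec-path {spec_path} "
        f"{dst_pool}/{dst_image} --cluster cluster2",
        f"rbd migration execute {dst_pool}/{dst_image} --cluster cluster2",
        f"rbd migration commit {dst_pool}/{dst_image} --cluster cluster2",
    ):
        print(cmd)

This mirrors steps 8-11 of the test flow documented in the module docstring of tests/rbd/test_rbd_migration_external_native_image.py.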