Merge pull request #1228 from liangxin1300/20230803_stop_cluster_all_crmsh45

[crmsh-4.5] Fix: ui_cluster: Improve the process of 'crm cluster stop' (bsc#1213889)
liangxin1300 authored Dec 12, 2023
2 parents 8b3c138 + 1fcb08c commit 89bb8cc
Showing 5 changed files with 191 additions and 91 deletions.
81 changes: 57 additions & 24 deletions crmsh/ui_cluster.py
@@ -190,40 +190,73 @@ def do_start(self, context, *args):
for node in node_list:
logger.info("The cluster stack started on {}".format(node))

@staticmethod
def _node_ready_to_stop_cluster_service(node):
"""
Check if the specific node is ready to stop cluster services
If both corosync.service and pacemaker.service are active, return True
If only some of these services are active, stop them first and return False
"""
corosync_active = utils.service_is_active("corosync.service", remote_addr=node)
sbd_active = utils.service_is_active("sbd.service", remote_addr=node)
pacemaker_active = utils.service_is_active("pacemaker.service", remote_addr=node)

if not corosync_active:
if sbd_active:
utils.stop_service("corosync", remote_addr=node)
logger.info(f"The cluster stack stopped on {node}")
else:
logger.info(f"The cluster stack already stopped on {node}")
return False

elif not pacemaker_active:
utils.stop_service("corosync", remote_addr=node)
logger.info("The cluster stack stopped on {}".format(node))
return False

return True

@staticmethod
def _wait_for_dc(node=None):
"""
Wait for the cluster's DC to become available
"""
if not utils.service_is_active("pacemaker.service", remote_addr=node):
return

dc_deadtime = utils.get_property("dc-deadtime", peer=node) or str(constants.DC_DEADTIME_DEFAULT)
dc_timeout = int(dc_deadtime.strip('s')) + 5
try:
utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout, peer=node)
except TimeoutError:
logger.error("No DC found currently, please wait if the cluster is still starting")
raise utils.TerminateSubCommand
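
For reference, the timeout arithmetic above works out as in this minimal sketch (the "20s" value is illustrative, matching the "Ns" format the dc-deadtime property is stored in):

# A dc-deadtime of "20s" gives _wait_for_dc a 25-second polling budget --
# the same 25s the unit tests below assert for wait_timeout:
dc_deadtime = "20s"
dc_timeout = int(dc_deadtime.strip('s')) + 5
assert dc_timeout == 25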

@staticmethod
def _set_dlm(node=None):
"""
When dlm is running and quorum is lost, set enable_quorum_fencing=0 and
enable_quorum_lockspace=0 in the dlm config before stopping the cluster service
"""
if utils.is_dlm_running(node) and not utils.is_quorate(node):
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
utils.set_dlm_option(peer=node, enable_quorum_fencing=0, enable_quorum_lockspace=0)

@command.skill_level('administrator')
def do_stop(self, context, *args):
'''
Stops the cluster stack on all nodes or specific node(s)
'''
node_list = parse_option_for_nodes(context, *args)
for node in node_list[:]:
if not utils.service_is_active("corosync.service", remote_addr=node):
if utils.service_is_active("sbd.service", remote_addr=node):
utils.stop_service("corosync", remote_addr=node)
logger.info("The cluster stack stopped on {}".format(node))
else:
logger.info("The cluster stack already stopped on {}".format(node))
node_list.remove(node)
elif not utils.service_is_active("pacemaker.service", remote_addr=node):
utils.stop_service("corosync", remote_addr=node)
logger.info("The cluster stack stopped on {}".format(node))
node_list.remove(node)
node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
if not node_list:
return
logger.debug(f"stop node list: {node_list}")

dc_deadtime = utils.get_property("dc-deadtime") or str(constants.DC_DEADTIME_DEFAULT)
dc_timeout = int(dc_deadtime.strip('s')) + 5
try:
utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout)
except TimeoutError:
logger.error("No DC found currently, please wait if the cluster is still starting")
return False
self._wait_for_dc(node_list[0])

# When dlm running and quorum is lost, before stop cluster service, should set
# enable_quorum_fencing=0, enable_quorum_lockspace=0 for dlm config option
if utils.is_dlm_running() and not utils.is_quorate():
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
utils.set_dlm_option(enable_quorum_fencing=0, enable_quorum_lockspace=0)
self._set_dlm(node_list[0])

# Stop pacemaker first, since it keeps the cluster quorate until corosync is stopped
node_list = utils.stop_service("pacemaker", node_list=node_list)
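Taken together, the refactor reduces do_stop to a filter-then-stop pipeline. Below is a self-contained sketch of that flow with hypothetical stand-ins for the crmsh helpers (ready_to_stop, wait_for_dc, set_dlm, stop_service are assumed names, not the real API):

from typing import Callable, List

def stop_cluster(nodes: List[str],
                 ready_to_stop: Callable[[str], bool],
                 wait_for_dc: Callable[[str], None],
                 set_dlm: Callable[[str], None],
                 stop_service: Callable[..., List[str]]) -> None:
    # Drop nodes whose stack is already down (stopping leftovers as needed).
    nodes = [n for n in nodes if ready_to_stop(n)]
    if not nodes:
        return
    # All later queries go through nodes[0], a node known to still be up;
    # this is what lets "crm cluster stop --all" succeed when the local
    # node's own services are already stopped (bsc#1213889).
    wait_for_dc(nodes[0])
    set_dlm(nodes[0])
    # Stop pacemaker first so the cluster stays quorate until corosync stops.
    stop_service("pacemaker", node_list=nodes)
    stop_service("corosync-qdevice.service", node_list=nodes)
    stop_service("corosync", node_list=nodes)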
62 changes: 38 additions & 24 deletions crmsh/utils.py
@@ -1142,14 +1142,14 @@ def append_file(dest, src):
return False


def get_dc():
def get_dc(peer=None):
cmd = "crmadmin -D -t 1"
rc, s, _ = get_stdout_stderr(add_sudo(cmd))
if rc != 0:
out = get_stdout_or_raise_error(add_sudo(cmd), remote=peer, no_raise=True)
if not out:
return None
if not s.startswith("Designated"):
if not out.startswith("Designated"):
return None
return s.split()[-1]
return out.split()[-1]
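
The rewritten get_dc routes through get_stdout_or_raise_error so it can also run on a peer. A minimal sketch of the parsing it performs, assuming crmadmin output of the shape shown (the node name is illustrative):

# `crmadmin -D -t 1` is expected to print something like:
#   Designated Controller is: node1
# get_dc returns the last whitespace-separated token, or None otherwise.
out = "Designated Controller is: node1"
dc = out.split()[-1] if out and out.startswith("Designated") else None
assert dc == "node1"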


def wait4dc(what="", show_progress=True):
@@ -3204,47 +3204,62 @@ def is_standby(node):
return re.search(r'Node\s+{}:\s+standby'.format(node), out) is not None


def get_dlm_option_dict():
def get_dlm_option_dict(peer=None):
"""
Get dlm config option dictionary
"""
out = get_stdout_or_raise_error("dlm_tool dump_config")
out = get_stdout_or_raise_error("dlm_tool dump_config", remote=peer)
return dict(re.findall("(\w+)=(\w+)", out))


def set_dlm_option(**kargs):
def set_dlm_option(peer=None, **kargs):
"""
Set dlm option
"""
dlm_option_dict = get_dlm_option_dict()
dlm_option_dict = get_dlm_option_dict(peer=peer)
for option, value in kargs.items():
if option not in dlm_option_dict:
raise ValueError('"{}" is not dlm config option'.format(option))
raise ValueError(f'"{option}" is not dlm config option')
if dlm_option_dict[option] != value:
get_stdout_or_raise_error('dlm_tool set_config "{}={}"'.format(option, value))
get_stdout_or_raise_error(f'dlm_tool set_config "{option}={value}"', remote=peer)
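
A hedged usage sketch for the new peer parameter (the node name and option values are illustrative): options are read from the peer first, and dlm_tool set_config is only invoked for values that actually differ:

# Roughly what ui_cluster's _set_dlm does when quorum is lost, but aimed at
# a specific node; runs e.g. dlm_tool set_config "enable_quorum_fencing=0"
# on node1:
set_dlm_option(peer="node1", enable_quorum_fencing=0, enable_quorum_lockspace=0)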


def is_dlm_running():
def is_dlm_running(peer=None):
"""
Check if dlm ra controld is running
"""
from . import xmlutil
return xmlutil.CrmMonXmlParser.is_resource_started(constants.DLM_CONTROLD_RA)
return is_resource_running(constants.DLM_CONTROLD_RA, peer=peer)


def has_resource_configured(ra_type, peer=None):
"""
Check if the RA is configured
"""
out = get_stdout_or_raise_error("crm_mon -1rR", remote=peer)
return re.search(ra_type, out) is not None


def is_dlm_configured():
def is_resource_running(ra_type, peer=None):
"""
Check if the RA is running
"""
out = get_stdout_or_raise_error("crm_mon -1rR", remote=peer)
patt = f"\({ra_type}\):\s*Started"
return re.search(patt, out) is not None
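
The pattern is built from the RA type; assuming constants.DLM_CONTROLD_RA is "ocf::pacemaker:controld", a minimal check against an illustrative crm_mon -1rR resource line looks like this:

import re

# Sample resource line (shape only; spacing and node name are illustrative):
sample = "  dlm   (ocf::pacemaker:controld):      Started node1"
patt = r"\(ocf::pacemaker:controld\):\s*Started"
assert re.search(patt, sample) is not None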


def is_dlm_configured(peer=None):
"""
Check if dlm is configured
"""
from . import xmlutil
return xmlutil.CrmMonXmlParser.is_resource_configured(constants.DLM_CONTROLD_RA)
return has_resource_configured(constants.DLM_CONTROLD_RA, peer=peer)


def is_quorate():
def is_quorate(peer=None):
"""
Check if the cluster is quorate
"""
out = get_stdout_or_raise_error("corosync-quorumtool -s", success_val_list=[0, 2])
out = get_stdout_or_raise_error("corosync-quorumtool -s", remote=peer, success_val_list=[0, 2])
res = re.search(r'Quorate:\s+(.*)', out)
if res:
return res.group(1) == "Yes"
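
A worked example of the quorum parsing, against an illustrative excerpt of corosync-quorumtool -s output:

import re

sample = "Quorate:          Yes"
res = re.search(r'Quorate:\s+(.*)', sample)
# is_quorate returns True exactly when the captured value is "Yes".
assert res is not None and res.group(1) == "Yes"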
@@ -3270,7 +3285,7 @@ def get_pcmk_delay_max(two_node_without_qdevice=False):
return 0


def get_property(name, property_type="crm_config"):
def get_property(name, property_type="crm_config", peer=None):
"""
Get cluster properties
@@ -3281,8 +3296,7 @@ def get_property(name, property_type="crm_config"):
cmd = "CIB_file={} sudo --preserve-env=CIB_file crm configure get_property {}".format(cib_path, name)
else:
cmd = "sudo crm_attribute -t {} -n {} -Gq".format(property_type, name)
rc, stdout, _ = get_stdout_stderr(cmd)
return stdout if rc == 0 else None
return get_stdout_or_raise_error(cmd, remote=peer, no_raise=True)
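
As a sketch of what a typical call now executes (the property name is real, the peer hypothetical): get_property("dc-deadtime", peer="node1") builds the crm_attribute command below and, with no_raise=True, hands back whatever stdout there is instead of raising when the property is unset:

cmd = "sudo crm_attribute -t {} -n {} -Gq".format("crm_config", "dc-deadtime")
assert cmd == "sudo crm_attribute -t crm_config -n dc-deadtime -Gq"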


def check_no_quorum_policy_with_dlm():
@@ -3428,7 +3442,7 @@ def detect_file(_file, remote=None):
return rc


def check_function_with_timeout(check_function, wait_timeout=30, interval=1):
def check_function_with_timeout(check_function, wait_timeout=30, interval=1, *args, **kwargs):
"""
Run check_function in a loop
Return when check_function is true
@@ -3437,7 +3451,7 @@ def check_function_with_timeout(check_function, wait_timeout=30, interval=1):
current_time = int(time.time())
timeout = current_time + wait_timeout
while current_time <= timeout:
if check_function():
if check_function(*args, **kwargs):
return
time.sleep(interval)
current_time = int(time.time())
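
The new *args/**kwargs are forwarded on every poll, which is what lets _wait_for_dc pass peer through to utils.get_dc. A self-contained demonstration with a fake check function (assuming check_function_with_timeout is importable from crmsh.utils; it raises TimeoutError once the budget is exhausted, as the do_stop hunk above relies on):

from crmsh.utils import check_function_with_timeout

polls = {"count": 0}

def fake_get_dc(peer=None):
    # Stand-in for utils.get_dc: succeeds on the third poll.
    polls["count"] += 1
    return "node1" if polls["count"] >= 3 else None

check_function_with_timeout(fake_get_dc, wait_timeout=5, interval=0, peer="node1")
assert polls["count"] == 3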
14 changes: 14 additions & 0 deletions test/features/bootstrap_bugs.feature
@@ -132,6 +132,20 @@ Feature: Regression test for bootstrap bugs
When Run "crm cluster stop" on "hanode1"
Then Service "corosync" is "stopped" on "hanode1"

@clean
Scenario: Can't stop all nodes' cluster service when local node's service is down (bsc#1213889)
Given Cluster service is "stopped" on "hanode1"
And Cluster service is "stopped" on "hanode2"
When Run "crm cluster init -y" on "hanode1"
Then Cluster service is "started" on "hanode1"
When Run "crm cluster join -c hanode1 -y" on "hanode2"
Then Cluster service is "started" on "hanode2"
When Wait for DC
And Run "crm cluster stop" on "hanode1"
And Run "crm cluster stop --all" on "hanode1"
Then Cluster service is "stopped" on "hanode1"
And Cluster service is "stopped" on "hanode2"

@skip_non_root
@clean
Scenario: crm cluster join default behavior change in ssh key handling (bsc#1210693)
107 changes: 73 additions & 34 deletions test/unittests/test_ui_cluster.py
@@ -80,52 +80,91 @@ def test_do_start(self, mock_parse_nodes, mock_active, mock_start, mock_qdevice_
mock_qdevice_configured.assert_called_once_with()
mock_info.assert_called_once_with("The cluster stack started on node1")

@mock.patch('logging.Logger.info')
@mock.patch('crmsh.utils.service_is_active')
@mock.patch('crmsh.ui_cluster.Cluster._wait_for_dc')
@mock.patch('crmsh.ui_cluster.Cluster._node_ready_to_stop_cluster_service')
@mock.patch('crmsh.ui_cluster.parse_option_for_nodes')
def test_do_stop_already_stopped(self, mock_parse_nodes, mock_active, mock_info):
def test_do_stop_return(self, mock_parse_nodes, mock_node_ready_to_stop_cluster_service, mock_dc):
mock_parse_nodes.return_value = ["node1", "node2"]
mock_node_ready_to_stop_cluster_service.side_effect = [False, False]

context_inst = mock.Mock()
mock_parse_nodes.return_value = ["node1"]
mock_active.side_effect = [False, False]
self.ui_cluster_inst.do_stop(context_inst, "node1")
mock_active.assert_has_calls([
mock.call("corosync.service", remote_addr="node1"),
mock.call("sbd.service", remote_addr="node1")
])
mock_info.assert_called_once_with("The cluster stack already stopped on node1")
self.ui_cluster_inst.do_stop(context_inst, "node1", "node2")

mock_parse_nodes.assert_called_once_with(context_inst, "node1", "node2")
mock_node_ready_to_stop_cluster_service.assert_has_calls([mock.call("node1"), mock.call("node2")])
mock_dc.assert_not_called()

@mock.patch('logging.Logger.debug')
@mock.patch('logging.Logger.info')
@mock.patch('crmsh.utils.stop_service')
@mock.patch('crmsh.utils.set_dlm_option')
@mock.patch('crmsh.utils.is_quorate')
@mock.patch('crmsh.utils.is_dlm_running')
@mock.patch('crmsh.utils.get_dc')
@mock.patch('crmsh.utils.check_function_with_timeout')
@mock.patch('crmsh.utils.get_property')
@mock.patch('crmsh.utils.service_is_active')
@mock.patch('crmsh.utils.stop_service')
@mock.patch('crmsh.ui_cluster.Cluster._set_dlm')
@mock.patch('crmsh.ui_cluster.Cluster._wait_for_dc')
@mock.patch('crmsh.ui_cluster.Cluster._node_ready_to_stop_cluster_service')
@mock.patch('crmsh.ui_cluster.parse_option_for_nodes')
def test_do_stop(self, mock_parse_nodes, mock_active, mock_get_property, mock_check, mock_get_dc, mock_dlm_running, mock_is_quorate, mock_set_dlm, mock_stop, mock_info, mock_debug):
def test_do_stop(self, mock_parse_nodes, mock_node_ready_to_stop_cluster_service, mock_dc,
mock_set_dlm, mock_stop, mock_is_active, mock_info, mock_debug):
mock_parse_nodes.return_value = ["node1", "node2"]
mock_node_ready_to_stop_cluster_service.side_effect = [True, False]
mock_stop.side_effect = [["node1"], ["node1"], ["node1"]]
mock_is_active.return_value = True

context_inst = mock.Mock()
mock_stop.side_effect = [["node1"], ["ndoe1"], ["node1"]]
mock_parse_nodes.return_value = ["node1"]
mock_active.side_effect = [True, True, True]
mock_dlm_running.return_value = True
mock_is_quorate.return_value = False
mock_get_property.return_value = "20s"
self.ui_cluster_inst.do_stop(context_inst, "node1", "node2")

self.ui_cluster_inst.do_stop(context_inst, "node1")
mock_parse_nodes.assert_called_once_with(context_inst, "node1", "node2")
mock_node_ready_to_stop_cluster_service.assert_has_calls([mock.call("node1"), mock.call("node2")])
mock_debug.assert_called_once_with("stop node list: ['node1']")
mock_dc.assert_called_once_with("node1")
mock_set_dlm.assert_called_once_with("node1")
mock_stop.assert_has_calls([
mock.call("pacemaker", node_list=["node1"]),
mock.call("corosync-qdevice.service", node_list=["node1"]),
mock.call("corosync", node_list=["node1"]),
])
mock_info.assert_called_once_with("The cluster stack stopped on node1")

mock_active.assert_has_calls([
@mock.patch('logging.Logger.info')
@mock.patch('crmsh.utils.stop_service')
@mock.patch('crmsh.utils.service_is_active')
def test__node_ready_to_stop_cluster_service_corosync(self, mock_is_active, mock_stop, mock_info):
mock_is_active.side_effect = [False, True, False]
res = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1")
assert res is False
mock_is_active.assert_has_calls([
mock.call("corosync.service", remote_addr="node1"),
mock.call("sbd.service", remote_addr="node1"),
mock.call("pacemaker.service", remote_addr="node1"),
mock.call("corosync-qdevice.service")
])
mock_stop.assert_has_calls([
mock.call("pacemaker", node_list=["node1"]),
mock.call("corosync-qdevice.service", node_list=["node1"]),
mock.call("corosync", node_list=["node1"])
mock_stop.assert_called_once_with("corosync", remote_addr="node1")
mock_info.assert_called_once_with("The cluster stack stopped on node1")

@mock.patch('logging.Logger.info')
@mock.patch('crmsh.utils.stop_service')
@mock.patch('crmsh.utils.service_is_active')
def test__node_ready_to_stop_cluster_service_pacemaker(self, mock_is_active, mock_stop, mock_info):
mock_is_active.side_effect = [True, True, False]
res = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1")
assert res is False
mock_is_active.assert_has_calls([
mock.call("corosync.service", remote_addr="node1"),
mock.call("sbd.service", remote_addr="node1"),
mock.call("pacemaker.service", remote_addr="node1"),
])
mock_stop.assert_called_once_with("corosync", remote_addr="node1")
mock_info.assert_called_once_with("The cluster stack stopped on node1")
mock_debug.assert_called_once_with("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
mock_check.assert_called_once_with(mock_get_dc, wait_timeout=25)

@mock.patch('logging.Logger.info')
@mock.patch('crmsh.utils.stop_service')
@mock.patch('crmsh.utils.service_is_active')
def test__node_ready_to_stop_cluster_service(self, mock_is_active, mock_stop, mock_info):
mock_is_active.side_effect = [True, True, True]
res = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1")
assert res is True
mock_is_active.assert_has_calls([
mock.call("corosync.service", remote_addr="node1"),
mock.call("sbd.service", remote_addr="node1"),
mock.call("pacemaker.service", remote_addr="node1"),
])
mock_info.assert_not_called()
mock_stop.assert_not_called()
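
One reading aid for the tests above, since the mock wiring is easy to misread: @mock.patch decorators apply bottom-up, so the bottom-most patch becomes the first mock argument after self. A minimal illustration (patch targets taken from the tests, function name hypothetical, assuming crmsh is importable):

from unittest import mock

@mock.patch('crmsh.utils.stop_service')        # injected second
@mock.patch('crmsh.utils.service_is_active')   # bottom-most: injected first
def example(mock_is_active, mock_stop):
    mock_is_active.side_effect = [True, True, False]  # consumed in call order
    return mock_is_active(), mock_is_active(), mock_is_active()

assert example() == (True, True, False)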
