refactor: working upgrades
marcoppenheimer committed Jan 1, 2024
1 parent 63580c4 commit 8bfb321
Showing 2 changed files with 143 additions and 6 deletions.
13 changes: 7 additions & 6 deletions src/charm.py
@@ -37,7 +37,7 @@
from managers.config import ConfigManager
from managers.quorum import QuorumManager
from managers.tls import TLSManager
-from upgrade import ZooKeeperDependencyModel, ZooKeeperUpgrade
+from events.upgrade import ZooKeeperDependencyModel, ZooKeeperEvents

logger = logging.getLogger(__name__)

@@ -53,7 +53,8 @@ def __init__(self, *args):
        self.state = ClusterState(self, substrate=SUBSTRATE)
        self.workload = ZKServiceVM()

-        # --- HANDLERS ---
+        # --- CHARM EVENT HANDLERS ---
+
        self.tls_events = TLSEvents(self)
        self.provider_events = ProviderEvents(self)

@@ -68,7 +69,7 @@ def __init__(self, *args):
        )
        # TODO: add upgrade manager?

-        # --- OTHER ---
+        # --- LIB EVENT HANDLERS ---

        self.restart = RollingOpsManager(self, relation="restart", callback=self._restart)
        self._grafana_agent = COSAgentProvider(
@@ -81,14 +82,14 @@ def __init__(self, *args):
            logs_rules_dir="./src/alert_rules/loki",
            log_slots=["charmed-zookeeper:logs"],
        )
-        self.upgrade = ZooKeeperUpgrade(
+        self.upgrade = ZooKeeperEvents(
            self,
            dependency_model=ZooKeeperDependencyModel(
                **DEPENDENCIES  # pyright: ignore[reportGeneralTypeIssues]
            ),
        )

-        # --- EVENTS ---
+        # --- CORE EVENTS ---

        self.framework.observe(getattr(self.on, "install"), self._on_install)
        self.framework.observe(getattr(self.on, "start"), self._manual_restart)
@@ -119,7 +120,7 @@ def __init__(self, *args):
        )
        self.framework.observe(getattr(self.on, "set_password_action"), self._set_password_action)

-    # --- EVENT HANDLERS ---
+    # --- CORE EVENT HANDLERS ---

    def _on_install(self, event: InstallEvent) -> None:
        """Handler for the `on_install` event."""
136 changes: 136 additions & 0 deletions src/events/upgrade.py
@@ -0,0 +1,136 @@
from functools import cached_property
from typing_extensions import override
from kazoo.client import ConnectionClosedError
from typing import TYPE_CHECKING
import logging

from charms.data_platform_libs.v0.upgrade import BaseModel, ClusterNotReadyError, DataUpgrade, DependencyModel, UpgradeGrantedEvent
from charms.zookeeper.v0.client import QuorumLeaderNotFoundError, ZooKeeperManager
from literals import CLIENT_PORT

if TYPE_CHECKING:
    from charm import ZooKeeperCharm

logger = logging.getLogger(__name__)


class ZooKeeperDependencyModel(BaseModel):
"""Model for ZooKeeper Operator dependencies."""

service: DependencyModel


class ZooKeeperEvents(DataUpgrade):
"""Implementation of :class:`DataUpgrade` overrides for in-place upgrades."""

def __init__(self, charm: "ZooKeeperCharm", **kwargs):
super().__init__(charm, **kwargs)
self.charm = charm

@property
def idle(self) -> bool:
"""Checks if cluster state is idle.
Returns:
True if cluster state is idle. Otherwise False
"""
return self.cluster_state == "idle"

@cached_property
def client(self) -> ZooKeeperManager:
"""Cached client manager application for performing ZK commands."""
return ZooKeeperManager(
hosts=[server.host for server in self.charm.state.started_servers],
client_port=CLIENT_PORT,
username="super",
password=self.charm.state.cluster.internal_user_credentials.get("super", ""),
)

def post_upgrade_check(self) -> None:
"""Runs necessary checks validating the unit is in a healthy state after upgrade."""
self.pre_upgrade_check()

@override
def pre_upgrade_check(self) -> None:
default_message = "Pre-upgrade check failed and cannot safely upgrade"
try:
if not self.client.members_broadcasting or not len(self.client.server_members) == len(
self.charm.state.servers
):
raise ClusterNotReadyError(
message=default_message,
cause="Not all application units are connected and broadcasting in the quorum",
)

if self.client.members_syncing:
raise ClusterNotReadyError(
message=default_message, cause="Some quorum members are syncing data"
)

if not self.charm.state.stable:
raise ClusterNotReadyError(
message=default_message, cause="Charm has not finished initialising"
)

except QuorumLeaderNotFoundError:
raise ClusterNotReadyError(message=default_message, cause="Quorum leader not found")
except ConnectionClosedError:
raise ClusterNotReadyError(
message=default_message, cause="Unable to connect to the cluster"
)
except Exception as e:
logger.debug(str(e))
raise ClusterNotReadyError(message=default_message, cause="Unknown error")

@override
def build_upgrade_stack(self) -> list[int]:
upgrade_stack = []
for server in self.charm.state.servers:
# upgrade quorum leader last
if server.host == self.client.leader:
upgrade_stack.insert(0, server.unit_id)
else:
upgrade_stack.append(server.unit_id)

return upgrade_stack

@override
def log_rollback_instructions(self) -> None:
logger.critical(
"\n".join(
[
"Unit failed to upgrade and requires manual rollback to previous stable version.",
" 1. Re-run `pre-upgrade-check` action on the leader unit to enter 'recovery' state",
" 2. Run `juju refresh` to the previously deployed charm revision",
]
)
)
return

@override
def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None:
self.charm.snap.stop_snap_service()

if not self.charm.snap.install():
logger.error("Unable to install ZooKeeper Snap")
self.set_unit_failed()
return

logger.info(f"{self.charm.unit.name} upgrading service...")
self.charm.snap.restart_snap_service()

try:
logger.debug("Running post-upgrade check...")
self.post_upgrade_check()

logger.debug("Marking unit completed...")
self.set_unit_completed()

# ensures leader gets it's own relation-changed when it upgrades
if self.charm.unit.is_leader():
logger.debug("Re-emitting upgrade-changed on leader...")
self.on_upgrade_changed(event)

except ClusterNotReadyError as e:
logger.error(e.cause)
self.set_unit_failed()

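For reference, a minimal sketch of how this handler is typically exercised from the Juju CLI, following the `pre-upgrade-check` action and `juju refresh` steps referenced in log_rollback_instructions above. The application name `zookeeper`, the target channel, and the Juju 2.9 action syntax are assumptions, not part of this commit:

    # run the pre-upgrade safety checks on the leader unit first (placeholder app name)
    juju run-action zookeeper/leader pre-upgrade-check --wait
    # then refresh to the new revision; units are upgraded per build_upgrade_stack,
    # with the quorum leader last (placeholder channel)
    juju refresh zookeeper --channel=3/edge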