Skip to content

Commit

Permalink
reduce noise of zenduty alerts (#70)
Browse files Browse the repository at this point in the history
* reduce noise of zenduty alerts

* keep resolution threshold at 2 minutes

* fix zenduty ratelimited log

* ensure event is dropped from memory when resolved

* update comment

* bump version
  • Loading branch information
ayazabbas authored May 17, 2024
1 parent 7088a26 commit d8ca2b6
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 17 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ignore_missing_imports = true

[tool.poetry]
name = "pyth-observer"
version = "0.2.6"
version = "0.2.7"
description = "Alerts and stuff"
authors = []
readme = "README.md"
Expand Down
44 changes: 30 additions & 14 deletions pyth_observer/dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
from pyth_observer.event import DatadogEvent # Used dynamically
from pyth_observer.event import LogEvent # Used dynamically
from pyth_observer.event import TelegramEvent # Used dynamically
from pyth_observer.event import ZendutyEvent # Used dynamically
from pyth_observer.event import Event
from pyth_observer.event import Context, Event, ZendutyEvent
from pyth_observer.zenduty import send_zenduty_alert

assert DatadogEvent
Expand Down Expand Up @@ -46,6 +45,9 @@ def __init__(self, config, publishers):
if "ZendutyEvent" in self.config["events"]:
self.open_alerts_file = os.environ["OPEN_ALERTS_FILE"]
self.open_alerts = self.load_alerts()
# below is used to store events to later send if multiple failures occur
# events cannot be stored in open_alerts as they are not JSON serializable.
self.zenduty_events = {}

def load_alerts(self):
try:
Expand All @@ -68,17 +70,14 @@ async def run(self, states: List[State]):

# Then, wrap each failed check in events and send them
sent_events: List[Awaitable] = []
context = {
"network": self.config["network"]["name"],
"publishers": self.publishers,
}
context = Context(
network=self.config["network"]["name"], publishers=self.publishers
)

for check in failed_checks:
for event_type in self.config["events"]:
event: Event = globals()[event_type](check, context)

sent_events.append(event.send())

if event_type == "ZendutyEvent":
# Add failed check to open alerts
alert_identifier = (
Expand All @@ -87,28 +86,45 @@ async def run(self, states: List[State]):
state = check.state()
if isinstance(state, PublisherState):
alert_identifier += f"-{state.publisher_name}"
self.open_alerts[alert_identifier] = datetime.now().isoformat()
try:
failures = self.open_alerts[alert_identifier]["failures"] + 1
except KeyError:
failures = 1
self.open_alerts[alert_identifier] = {
"last_failure": datetime.now().isoformat(),
"failures": failures,
}
# store the event to send it later if it fails multiple times
self.zenduty_events[alert_identifier] = event
continue # do not immediately send a zenduty alert

sent_events.append(event.send())

await asyncio.gather(*sent_events)

# Check open alerts and resolve those that are older than 2 minutes
# Check open alerts for zenduty
if "ZendutyEvent" in self.config["events"]:

to_remove = []
current_time = datetime.now()
for identifier, last_failure in self.open_alerts.items():
if current_time - datetime.fromisoformat(last_failure) >= timedelta(
minutes=2
):
for identifier, info in self.open_alerts.items():
# Resolve the alert if it last failed > 2 minutes ago
if current_time - datetime.fromisoformat(
info["last_failure"]
) >= timedelta(minutes=2):
logger.debug(f"Resolving Zenduty alert {identifier}")
response = await send_zenduty_alert(
alert_identifier=identifier, message=identifier, resolved=True
)
if response and 200 <= response.status < 300:
to_remove.append(identifier)
elif info["failures"] > 2:
# Raise alert if the check has failed more than twice before self-resolving
await self.zenduty_events[identifier].send()

for identifier in to_remove:
del self.open_alerts[identifier]
del self.zenduty_events[identifier]

# Write open alerts to file to ensure persistence
with open(self.open_alerts_file, "w") as file:
Expand Down
5 changes: 3 additions & 2 deletions pyth_observer/zenduty.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,12 @@ async def send_zenduty_alert(alert_identifier, message, resolved=False, summary=
elif response.status == 429:
retries += 1
if retries < max_retries:
sleeptime = min(30, 2**retries)
logger.error(
f"Received 429 Too Many Requests for {alert_identifier}. Retrying in 1 second..."
f"Received 429 Too Many Requests for {alert_identifier}. Retrying in {sleeptime} s..."
)
await asyncio.sleep(
min(30, 2**retries)
sleeptime
) # Backoff before retrying, wait up to 30s
else:
logger.error(
Expand Down

0 comments on commit d8ca2b6

Please sign in to comment.