clean: clean types and refactor some functions (#208)
Another cleaning PR:
- Remove useless types and imports
- Add a few type hints
- Slight refactor of some functions
bolinocroustibat authored Oct 30, 2024
1 parent a8932fe commit 7c83764
Showing 3 changed files with 30 additions and 34 deletions.
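
A note on the typing changes, which repeat across all three files: the `typing.Optional` and `typing.Tuple` generics are swapped for the builtin `tuple` (PEP 585, Python 3.9+) and the `X | None` union syntax (PEP 604, Python 3.10+). A minimal before/after sketch — the `head` functions are invented for illustration:

```python
from typing import Optional, Tuple

# Before: typing-module generics.
def head(items: Tuple[int, ...]) -> Optional[int]:
    return items[0] if items else None

# After: builtin generics (PEP 585) and the union operator
# (PEP 604); no typing import needed for these two cases.
def head_modern(items: tuple[int, ...]) -> int | None:
    return items[0] if items else None
```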
15 changes: 8 additions & 7 deletions udata_hydra/crawl/helpers.py
@@ -1,7 +1,6 @@
 from datetime import datetime, timedelta, timezone
-from typing import Any, Optional, Tuple
+from typing import Any, Tuple

 from aiohttp import web
 from multidict import CIMultiDictProxy

 from udata_hydra import config, context
@@ -55,7 +54,7 @@ def has_nice_head(resp) -> bool:
     return True


-def is_valid_status(status: str) -> Optional[bool]:
+def is_valid_status(status: str) -> bool | None:
     if not status:
         return False
     status_nb = int(status)
@@ -65,17 +64,19 @@ def is_valid_status(status: str) -> Optional[bool]:
     return status_nb >= 200 and status_nb < 400


-async def is_domain_backoff(domain: str) -> Tuple[bool, str]:
+async def is_domain_backoff(domain: str) -> tuple[bool, str]:
     """Check if we should not crawl on this domain, in order to avoid 429 errors/bans as much as we can. We backoff if:
     - we have hit a 429
     - we have hit the rate limit on our side
     Returns a tuple with if it should backoff or not (boolean) and the reason why (string)
     """
-    backoff = (False, "")
-    no_backoff = config.NO_BACKOFF_DOMAINS
-    if domain in no_backoff:
+    backoff: tuple = (False, "")
+
+    if domain in config.NO_BACKOFF_DOMAINS:
         return backoff
+
     since_backoff_period = datetime.now(timezone.utc) - timedelta(seconds=config.BACKOFF_PERIOD)
+
     pool = await context.pool()
     async with pool.acquire() as connection:
         # check if we trigger BACKOFF_NB_REQ for BACKOFF_PERIOD on this domain
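
The docstring above spells out the contract: `is_domain_backoff` returns a `(should_backoff, reason)` pair. A hypothetical caller sketch — not part of this commit; `maybe_crawl` and its logging are invented for illustration:

```python
import logging

from udata_hydra.crawl.helpers import is_domain_backoff

log = logging.getLogger("udata-hydra")

async def maybe_crawl(url: str, domain: str) -> None:
    """Skip the URL entirely while the domain is in a backoff window."""
    backoff, reason = await is_domain_backoff(domain)
    if backoff:
        log.info(f"Skipping {url}: backing off on {domain} ({reason})")
        return
    # ...proceed with the actual check here
```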
5 changes: 2 additions & 3 deletions udata_hydra/crawl/process_check_data.py
@@ -1,6 +1,5 @@
 import json
 from datetime import datetime, timezone
-from typing import Optional, Tuple

 from asyncpg import Record

@@ -10,7 +9,7 @@
 from udata_hydra.utils import queue, send


-async def process_check_data(check_data: dict) -> Tuple[Record, bool]:
+async def process_check_data(check_data: dict) -> tuple[Record, bool]:
     """Preprocess a check before saving it
     Return the check and a boolean indicating if it's the first check for this resource"""

@@ -34,7 +33,7 @@ async def process_check_data(check_data: dict) -> Tuple[Record, bool]:
     return await Check.insert(check_data), is_first_check


-async def has_check_changed(check_data: dict, last_check: Optional[dict]) -> bool:
+async def has_check_changed(check_data: dict, last_check: dict | None) -> bool:
     """Check if the check has changed compared to the last one"""

     is_first_check: bool = last_check is None
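
As the docstrings state, `process_check_data` hands back both the saved check and an `is_first_check` flag derived from `has_check_changed`. An illustrative consumer — invented for this note, not code from the repository:

```python
from udata_hydra.crawl.process_check_data import process_check_data

async def record_check(check_data: dict) -> None:
    check, is_first_check = await process_check_data(check_data)
    if is_first_check:
        # A resource seen for the first time could trigger extra
        # processing; this branch is illustrative only.
        print(f"first check stored for resource: {check}")
```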
44 changes: 20 additions & 24 deletions udata_hydra/routes/status.py
@@ -8,27 +8,6 @@
 from udata_hydra.worker import QUEUES


-async def get_status_counts(request: web.Request) -> dict[str, int]:
-    pool = request.app["pool"]
-
-    status_counts = {status: 0 for status in Resource.STATUSES}
-    status_counts[None] = 0
-
-    async with pool.acquire() as connection:
-        q = """
-            SELECT COALESCE(status, 'NULL') AS status, COUNT(*) AS count
-            FROM catalog
-            GROUP BY COALESCE(status, 'NULL');
-        """
-        rows = await connection.fetch(q)
-
-    for row in rows:
-        status = row["status"] if row["status"] != "NULL" else None
-        status_counts[status] = row["count"]
-
-    return status_counts
-
-
 async def get_crawler_status(request: web.Request) -> web.Response:
     q = f"""
         SELECT
@@ -40,8 +19,8 @@ async def get_crawler_status(request: web.Request) -> web.Response:
"""
stats_catalog = await request.app["pool"].fetchrow(q)

since = parse_timespan(config.SINCE)
since = datetime.now(timezone.utc) - timedelta(seconds=since)
since_seconds: float = parse_timespan(config.SINCE)
since: datetime = datetime.now(timezone.utc) - timedelta(seconds=since_seconds)
q = f"""
SELECT
SUM(CASE WHEN checks.created_at <= $1 THEN 1 ELSE 0 END) AS count_outdated
@@ -60,14 +39,31 @@
     rate_checked = round(stats_catalog["count_checked"] / total * 100, 1)
     rate_checked_fresh = round(count_checked / total * 100, 1)

+    async def get_resources_status_counts(request: web.Request) -> dict[str | None, int]:
+        status_counts: dict = {status: 0 for status in Resource.STATUSES}
+        status_counts[None] = 0
+
+        q = """
+            SELECT COALESCE(status, 'NULL') AS status, COUNT(*) AS count
+            FROM catalog
+            GROUP BY COALESCE(status, 'NULL');
+        """
+        rows = await request.app["pool"].fetch(q)
+
+        for row in rows:
+            status = row["status"] if row["status"] != "NULL" else None
+            status_counts[status] = row["count"]
+
+        return status_counts
+
     return web.json_response(
         {
             "total": total,
             "pending_checks": count_left,
             "fresh_checks": count_checked,
             "checks_percentage": rate_checked,
             "fresh_checks_percentage": rate_checked_fresh,
-            "resources_statuses_count": await get_status_counts(request),
+            "resources_statuses_count": await get_resources_status_counts(request),
         }
     )


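The moved helper keeps the same NULL-handling trick: `COALESCE(status, 'NULL')` folds SQL NULL statuses into a sentinel string so they survive the GROUP BY, and the Python side maps the sentinel back to a `None` dict key. A standalone sketch of that round-trip — rows and status names are invented:

```python
# Fake query output: COALESCE has already replaced NULL with 'NULL'.
rows = [
    {"status": "NULL", "count": 7},      # resources with status IS NULL
    {"status": "TO_CHECK", "count": 3},  # invented status value
]

status_counts: dict[str | None, int] = {}
for row in rows:
    # Map the 'NULL' sentinel back to a real None key.
    status = row["status"] if row["status"] != "NULL" else None
    status_counts[status] = row["count"]

print(status_counts)  # {None: 7, 'TO_CHECK': 3}
```

Note that the real helper pre-seeds the dict from `Resource.STATUSES`, so statuses with zero matching rows still show up in the response with a count of 0.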
