clean: clean types and refactor some functions (#208)
Another cleaning PR:
- Remove useless types and imports
- Add a few type hints
- Slight refactor of some functions
bolinocroustibat authored Oct 30, 2024
1 parent a8932fe commit 7c83764
Showing 3 changed files with 30 additions and 34 deletions.
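
A note on the typing changes, which repeat across all three files: the `typing.Optional` and `typing.Tuple` generics are swapped for the builtin `tuple` (PEP 585, Python 3.9+) and the `X | None` union syntax (PEP 604, Python 3.10+). A minimal before/after sketch — the `head` functions are invented for illustration:

```python
from typing import Optional, Tuple

# Before: typing-module generics.
def head(items: Tuple[int, ...]) -> Optional[int]:
    return items[0] if items else None

# After: builtin generics (PEP 585) and the union operator
# (PEP 604); no typing import needed for these two cases.
def head_modern(items: tuple[int, ...]) -> int | None:
    return items[0] if items else None
```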
15 changes: 8 additions & 7 deletions udata_hydra/crawl/helpers.py
@@ -1,7 +1,6 @@
 from datetime import datetime, timedelta, timezone
-from typing import Any, Optional, Tuple
+from typing import Any, Tuple

 from aiohttp import web
 from multidict import CIMultiDictProxy

 from udata_hydra import config, context
@@ -55,7 +54,7 @@ def has_nice_head(resp) -> bool:
     return True


-def is_valid_status(status: str) -> Optional[bool]:
+def is_valid_status(status: str) -> bool | None:
     if not status:
         return False
     status_nb = int(status)
@@ -65,17 +64,19 @@ def is_valid_status(status: str) -> Optional[bool]:
     return status_nb >= 200 and status_nb < 400


-async def is_domain_backoff(domain: str) -> Tuple[bool, str]:
+async def is_domain_backoff(domain: str) -> tuple[bool, str]:
     """Check if we should not crawl on this domain, in order to avoid 429 errors/bans as much as we can. We backoff if:
     - we have hit a 429
     - we have hit the rate limit on our side
     Returns a tuple with if it should backoff or not (boolean) and the reason why (string)
     """
-    backoff = (False, "")
-    no_backoff = config.NO_BACKOFF_DOMAINS
-    if domain in no_backoff:
+    backoff: tuple = (False, "")
+
+    if domain in config.NO_BACKOFF_DOMAINS:
         return backoff
+
     since_backoff_period = datetime.now(timezone.utc) - timedelta(seconds=config.BACKOFF_PERIOD)
+
     pool = await context.pool()
     async with pool.acquire() as connection:
         # check if we trigger BACKOFF_NB_REQ for BACKOFF_PERIOD on this domain
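
The docstring above spells out the contract: `is_domain_backoff` returns a `(should_backoff, reason)` pair. A hypothetical caller sketch — not part of this commit; `maybe_crawl` and its logging are invented for illustration:

```python
import logging

from udata_hydra.crawl.helpers import is_domain_backoff

log = logging.getLogger("udata-hydra")

async def maybe_crawl(url: str, domain: str) -> None:
    """Skip the URL entirely while the domain is in a backoff window."""
    backoff, reason = await is_domain_backoff(domain)
    if backoff:
        log.info(f"Skipping {url}: backing off on {domain} ({reason})")
        return
    # ...proceed with the actual check here
```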
5 changes: 2 additions & 3 deletions udata_hydra/crawl/process_check_data.py
@@ -1,6 +1,5 @@
 import json
 from datetime import datetime, timezone
-from typing import Optional, Tuple

 from asyncpg import Record

@@ -10,7 +9,7 @@
 from udata_hydra.utils import queue, send


-async def process_check_data(check_data: dict) -> Tuple[Record, bool]:
+async def process_check_data(check_data: dict) -> tuple[Record, bool]:
     """Preprocess a check before saving it
     Return the check and a boolean indicating if it's the first check for this resource"""

@@ -34,7 +33,7 @@ async def process_check_data(check_data: dict) -> Tuple[Record, bool]:
     return await Check.insert(check_data), is_first_check


-async def has_check_changed(check_data: dict, last_check: Optional[dict]) -> bool:
+async def has_check_changed(check_data: dict, last_check: dict | None) -> bool:
     """Check if the check has changed compared to the last one"""

     is_first_check: bool = last_check is None
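
As the docstrings state, `process_check_data` hands back both the saved check and an `is_first_check` flag derived from `has_check_changed`. An illustrative consumer — invented for this note, not code from the repository:

```python
from udata_hydra.crawl.process_check_data import process_check_data

async def record_check(check_data: dict) -> None:
    check, is_first_check = await process_check_data(check_data)
    if is_first_check:
        # A resource seen for the first time could trigger extra
        # processing; this branch is illustrative only.
        print(f"first check stored for resource: {check}")
```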
44 changes: 20 additions & 24 deletions udata_hydra/routes/status.py
@@ -8,27 +8,6 @@
 from udata_hydra.worker import QUEUES


-async def get_status_counts(request: web.Request) -> dict[str, int]:
-    pool = request.app["pool"]
-
-    status_counts = {status: 0 for status in Resource.STATUSES}
-    status_counts[None] = 0
-
-    async with pool.acquire() as connection:
-        q = """
-            SELECT COALESCE(status, 'NULL') AS status, COUNT(*) AS count
-            FROM catalog
-            GROUP BY COALESCE(status, 'NULL');
-        """
-        rows = await connection.fetch(q)
-
-    for row in rows:
-        status = row["status"] if row["status"] != "NULL" else None
-        status_counts[status] = row["count"]
-
-    return status_counts
-
-
 async def get_crawler_status(request: web.Request) -> web.Response:
     q = f"""
         SELECT
@@ -40,8 +19,8 @@ async def get_crawler_status(request: web.Request) -> web.Response:
"""
stats_catalog = await request.app["pool"].fetchrow(q)

since = parse_timespan(config.SINCE)
since = datetime.now(timezone.utc) - timedelta(seconds=since)
since_seconds: float = parse_timespan(config.SINCE)
since: datetime = datetime.now(timezone.utc) - timedelta(seconds=since_seconds)
q = f"""
SELECT
SUM(CASE WHEN checks.created_at <= $1 THEN 1 ELSE 0 END) AS count_outdated
@@ -60,14 +39,31 @@
     rate_checked = round(stats_catalog["count_checked"] / total * 100, 1)
     rate_checked_fresh = round(count_checked / total * 100, 1)

+    async def get_resources_status_counts(request: web.Request) -> dict[str | None, int]:
+        status_counts: dict = {status: 0 for status in Resource.STATUSES}
+        status_counts[None] = 0
+
+        q = """
+            SELECT COALESCE(status, 'NULL') AS status, COUNT(*) AS count
+            FROM catalog
+            GROUP BY COALESCE(status, 'NULL');
+        """
+        rows = await request.app["pool"].fetch(q)
+
+        for row in rows:
+            status = row["status"] if row["status"] != "NULL" else None
+            status_counts[status] = row["count"]
+
+        return status_counts
+
     return web.json_response(
         {
             "total": total,
             "pending_checks": count_left,
             "fresh_checks": count_checked,
             "checks_percentage": rate_checked,
             "fresh_checks_percentage": rate_checked_fresh,
-            "resources_statuses_count": await get_status_counts(request),
+            "resources_statuses_count": await get_resources_status_counts(request),
         }
     )


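The moved helper keeps the same NULL-handling trick: `COALESCE(status, 'NULL')` folds SQL NULL statuses into a sentinel string so they survive the GROUP BY, and the Python side maps the sentinel back to a `None` dict key. A standalone sketch of that round-trip — rows and status names are invented:

```python
# Fake query output: COALESCE has already replaced NULL with 'NULL'.
rows = [
    {"status": "NULL", "count": 7},      # resources with status IS NULL
    {"status": "TO_CHECK", "count": 3},  # invented status value
]

status_counts: dict[str | None, int] = {}
for row in rows:
    # Map the 'NULL' sentinel back to a real None key.
    status = row["status"] if row["status"] != "NULL" else None
    status_counts[status] = row["count"]

print(status_counts)  # {None: 7, 'TO_CHECK': 3}
```

Note that the real helper pre-seeds the dict from `Resource.STATUSES`, so statuses with zero matching rows still show up in the response with a count of 0.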
