fixup! Add scraper for new Scottish Parliament site

mysociety · Apr 19, 2024 · 2940dbf
1 parent 3c19e8e
commit 2940dbf
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 23 deletions.
diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py
@@ -6,7 +6,7 @@
 
 from __future__ import annotations
 
-from .download import fetch_debates_for_date
+from .download import fetch_debates_for_dates
 from .parse import parse_downloaded, tidy_up_html
 from .convert import convert_to_twfy
 import click
@@ -35,12 +35,12 @@ def fetch_debates_on_date(date: str, verbose: bool = False, override: bool = Fal
     """
 
     try:
-        datetime.datetime.fromisoformat(date)
+        datetime.date.fromisoformat(date)
     except ValueError:
         print(f"{date} is not a valid iso date")
 
-    for file in fetch_debates_for_date(
-        date, verbose=verbose, cache_dir=cache_dir, override=override
+    for file in fetch_debates_for_dates(
+        date, date, verbose=verbose, cache_dir=cache_dir, override=override
     ):
         tidy_up_html(file)
         convert_to_twfy(file, output_dir)
@@ -59,14 +59,12 @@ def fetch_debates_on_date_range(
     """
     Download transcripts from Scottish Parliament between a start and end date
     """
-    start = datetime.datetime.fromisoformat(start_date)
-    end = datetime.datetime.fromisoformat(end_date)
-    for n in range(int((end - start).days) + 1):
-        date = (start + datetime.timedelta(n)).date().isoformat()
-        for file in fetch_debates_for_date(
-            date, verbose=verbose, cache_dir=cache_dir, override=override
-        ):
-            tidy_up_html(file)
+    start = datetime.date.fromisoformat(start_date)
+    end = datetime.date.fromisoformat(end_date)
+    for file in fetch_debates_for_dates(
+        start.isoformat(), end.isoformat(), verbose=verbose, cache_dir=cache_dir, override=override
+    ):
+        tidy_up_html(file)
 
 
 @cli.command()

diff --git a/pyscraper/sp_2024/download.py b/pyscraper/sp_2024/download.py
@@ -23,16 +23,16 @@
 )
 
 scot_prefix = "https://www.parliament.scot"
-search_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament?msp=&committeeSelect=&qry=&dateSelect=custom&dtDateFrom={iso_date}&dtDateTo={iso_date}&showPlenary=true&ShowDebates=true&ShowFMQs=true&ShowGeneralQuestions=true&ShowPortfolioQuestions=true&ShowSPCBQuestions=true&ShowTopicalQuestions=true&ShowUrgentQuestions=true&showCommittee=true&page=1"
+search_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament?msp=&committeeSelect=&qry=&dateSelect=custom&dtDateFrom={start_date}&dtDateTo={end_date}&showPlenary=true&ShowDebates=true&ShowFMQs=true&ShowGeneralQuestions=true&ShowPortfolioQuestions=true&ShowSPCBQuestions=true&ShowTopicalQuestions=true&ShowUrgentQuestions=true&showCommittee=true&page={page}"
 item_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament/{slug}?meeting={id}&iob={iob}"
 major_heading_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament/{slug}?meeting={id}"
 
 
-def get_meeting_urls(date: str, page: int = 1):
+def get_meeting_urls(start_date: str, end_date: str, page: int = 1):
     """
-    Query the Scottish Parliament search page for a given date to get all links for agenda meetings
+    Query the Scottish Parliament search page for a given date range to get all links for agenda meetings
     """
-    date_url = scot_prefix + search_url.format(iso_date=date, page=page)
+    date_url = scot_prefix + search_url.format(start_date=start_date, end_date=end_date, page=page)
     response = requests.get(date_url, headers={"User-Agent": user_agent})
 
     # extract all urls that contain /search-what-was-said-in-parliament/
@@ -55,17 +55,17 @@ def get_meeting_urls(date: str, page: int = 1):
     return heading_count, meeting_urls
 
 
-def get_debate_groupings(date: str) -> list[DebateGrouping]:
+def get_debate_groupings(start_date: str, end_date: str) -> list[DebateGrouping]:
     """
-    Query the search page and get the urls for the meetings on that date
+    Query the search page and get the urls for the meetings in that date range
     """
 
     keep_fetching = True
     search_page = 1
     meeting_urls = []
 
     while keep_fetching:
-        heading_count, page_result_urls = get_meeting_urls(date, search_page)
+        heading_count, page_result_urls = get_meeting_urls(start_date, end_date, search_page)
         meeting_urls.extend(page_result_urls)
         if heading_count < 10:
             keep_fetching = False
@@ -78,6 +78,7 @@ def get_committee_slug(url: str):
     groupings = []
 
     for g, items in groupby(meeting_urls, key=get_committee_slug):
+        date = '-'.join(reversed(g.split('-')[-3:]))
         url_ids = []
         url_iobs = []
 
@@ -212,14 +213,14 @@ def save_xml(self, cache_dir: Path, override: bool = False) -> Path:
         return filename
 
 
-def fetch_debates_for_date(
-    date: str, cache_dir: Path, verbose: bool = False, override: bool = False
+def fetch_debates_for_dates(
+    start_date: str, end_date: str, cache_dir: Path, verbose: bool = False, override: bool = False
 ):
     """
-    Fetch debates across all chambers for a given date
+    Fetch debates across all chambers for a given date range
     """
     cache_dir.mkdir(parents=True, exist_ok=True)
-    for grouping in get_debate_groupings(date):
+    for grouping in get_debate_groupings(start_date, end_date):
         if verbose:
             print(f"Fetching debates for {grouping.committee_date_slug}")
         yield grouping.save_xml(cache_dir=cache_dir, override=override)