Skip to content

Commit

Permalink
fixup! Add scraper for new Scottish Parliament site
Browse files (browse the repository at this point in the history)
  • Loading branch information
dracos committed Apr 19, 2024
1 parent 3c19e8e commit 2940dbf
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 23 deletions.
22 changes: 10 additions & 12 deletions pyscraper/sp_2024/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from __future__ import annotations

from .download import fetch_debates_for_date
from .download import fetch_debates_for_dates
from .parse import parse_downloaded, tidy_up_html
from .convert import convert_to_twfy
import click
Expand Down Expand Up @@ -35,12 +35,12 @@ def fetch_debates_on_date(date: str, verbose: bool = False, override: bool = Fal
"""

try:
datetime.datetime.fromisoformat(date)
datetime.date.fromisoformat(date)
except ValueError:
print(f"{date} is not a valid iso date")

for file in fetch_debates_for_date(
date, verbose=verbose, cache_dir=cache_dir, override=override
for file in fetch_debates_for_dates(
date, date, verbose=verbose, cache_dir=cache_dir, override=override
):
tidy_up_html(file)
convert_to_twfy(file, output_dir)
Expand All @@ -59,14 +59,12 @@ def fetch_debates_on_date_range(
"""
Download transcripts from Scottish Parliament between a start and end date
"""
start = datetime.datetime.fromisoformat(start_date)
end = datetime.datetime.fromisoformat(end_date)
for n in range(int((end - start).days) + 1):
date = (start + datetime.timedelta(n)).date().isoformat()
for file in fetch_debates_for_date(
date, verbose=verbose, cache_dir=cache_dir, override=override
):
tidy_up_html(file)
start = datetime.date.fromisoformat(start_date)
end = datetime.date.fromisoformat(end_date)
for file in fetch_debates_for_dates(
start.isoformat(), end.isoformat(), verbose=verbose, cache_dir=cache_dir, override=override
):
tidy_up_html(file)


@cli.command()
Expand Down
23 changes: 12 additions & 11 deletions pyscraper/sp_2024/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,16 @@
)

scot_prefix = "https://www.parliament.scot"
search_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament?msp=&committeeSelect=&qry=&dateSelect=custom&dtDateFrom={iso_date}&dtDateTo={iso_date}&showPlenary=true&ShowDebates=true&ShowFMQs=true&ShowGeneralQuestions=true&ShowPortfolioQuestions=true&ShowSPCBQuestions=true&ShowTopicalQuestions=true&ShowUrgentQuestions=true&showCommittee=true&page=1"
search_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament?msp=&committeeSelect=&qry=&dateSelect=custom&dtDateFrom={start_date}&dtDateTo={end_date}&showPlenary=true&ShowDebates=true&ShowFMQs=true&ShowGeneralQuestions=true&ShowPortfolioQuestions=true&ShowSPCBQuestions=true&ShowTopicalQuestions=true&ShowUrgentQuestions=true&showCommittee=true&page={page}"
item_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament/{slug}?meeting={id}&iob={iob}"
major_heading_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament/{slug}?meeting={id}"


def get_meeting_urls(date: str, page: int = 1):
def get_meeting_urls(start_date: str, end_date: str, page: int = 1):
"""
Query the Scottish Parliament search page for a given date to get all links for agenda meetings
Query the Scottish Parliament search page for a given date range to get all links for agenda meetings
"""
date_url = scot_prefix + search_url.format(iso_date=date, page=page)
date_url = scot_prefix + search_url.format(start_date=start_date, end_date=end_date, page=page)
response = requests.get(date_url, headers={"User-Agent": user_agent})

# extract all urls that contain /search-what-was-said-in-parliament/
Expand All @@ -55,17 +55,17 @@ def get_meeting_urls(date: str, page: int = 1):
return heading_count, meeting_urls


def get_debate_groupings(date: str) -> list[DebateGrouping]:
def get_debate_groupings(start_date: str, end_date: str) -> list[DebateGrouping]:
"""
Query the search page and get the urls for the meetings on that date
Query the search page and get the urls for the meetings in that date range
"""

keep_fetching = True
search_page = 1
meeting_urls = []

while keep_fetching:
heading_count, page_result_urls = get_meeting_urls(date, search_page)
heading_count, page_result_urls = get_meeting_urls(start_date, end_date, search_page)
meeting_urls.extend(page_result_urls)
if heading_count < 10:
keep_fetching = False
Expand All @@ -78,6 +78,7 @@ def get_committee_slug(url: str):
groupings = []

for g, items in groupby(meeting_urls, key=get_committee_slug):
date = '-'.join(reversed(g.split('-')[-3:]))
url_ids = []
url_iobs = []

Expand Down Expand Up @@ -212,14 +213,14 @@ def save_xml(self, cache_dir: Path, override: bool = False) -> Path:
return filename


def fetch_debates_for_date(
date: str, cache_dir: Path, verbose: bool = False, override: bool = False
def fetch_debates_for_dates(
start_date: str, end_date: str, cache_dir: Path, verbose: bool = False, override: bool = False
):
"""
Fetch debates across all chambers for a given date
Fetch debates across all chambers for a given date range
"""
cache_dir.mkdir(parents=True, exist_ok=True)
for grouping in get_debate_groupings(date):
for grouping in get_debate_groupings(start_date, end_date):
if verbose:
print(f"Fetching debates for {grouping.committee_date_slug}")
yield grouping.save_xml(cache_dir=cache_dir, override=override)

0 comments on commit 2940dbf

Please sign in to comment.