Skip to content

Commit

Permalink
new england ski history timelines code
Browse files Browse the repository at this point in the history
  • Loading branch information
dhimmel committed Jan 26, 2025
1 parent 841a818 commit fe157d3
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 4 deletions.
10 changes: 10 additions & 0 deletions openskistats/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
ski_rose_the_world,
)
from openskistats.models import RunModel, SkiAreaModel
from openskistats.nesh.timelines import NewEnglandSkiHistoryTimelineScraper
from openskistats.openskimap_utils import (
download_openskimap_geojsons,
generate_openskimap_test_data,
Expand All @@ -26,6 +27,15 @@ def download() -> None:
"""Download latest OpenSkiMap source data."""
download_openskimap_geojsons()

@staticmethod
@cli.command(name="nesh_timelines") # type: ignore [misc]
def nesh_timelines() -> None:
    # NOTE(review): if `cli` is a Typer/Click-style app, the docstring below is
    # presumably what users see as this command's help text -- confirm before rewording it.
    """
    Scrape New England Ski History Timelines and save as JSON in the repository source code tree.
    This command is intended to be rerun infrequently and always manually.
    """
    # scrape_all_seasons() both returns the scraped rows and writes them to the
    # scraper's JSON_PATH; the command only needs the file, so the return value
    # is intentionally discarded.
    NewEnglandSkiHistoryTimelineScraper.scrape_all_seasons()

@staticmethod
@cli.command(name="analyze") # type: ignore [misc]
def analyze(
Expand Down
Empty file added openskistats/nesh/__init__.py
Empty file.
132 changes: 132 additions & 0 deletions openskistats/nesh/timelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
"""
Extract ski area opening and closing dates from New England Ski History Timelines.
For more information on New England Ski History, see
- https://www.newenglandskihistory.com
- https://skinewengland.net/
- https://skinewenglandnet.substack.com/
- https://github.com/dhimmel/openskistats/issues/23
"""

import calendar
import json
import logging
import time
from dataclasses import dataclass
from datetime import date
from functools import lru_cache
from pathlib import Path
from typing import Any, ClassVar, Literal

import requests
from bs4 import BeautifulSoup

from openskistats.utils import get_request_headers


@dataclass(frozen=True)
class NewEnglandSkiHistoryTimelineScraper:
    """
    Scrape one New England Ski History timeline page.

    An instance identifies a single page by ski season and by whether the page
    lists opening or closing dates. `scrape_all_seasons` iterates over every
    season/moment combination and writes the combined rows to `JSON_PATH`.
    """

    # Starting calendar year of the ski season, e.g. 2024 for the 2024-25 season.
    season: int
    # Which timeline page to request: "openingdates.php" or "closingdates.php".
    moment: Literal["opening", "closing"]
    NESH_URL: ClassVar[str] = "https://www.newenglandskihistory.com"
    JSON_PATH: ClassVar[Path] = Path(__file__).parent.joinpath(
        "new_england_ski_history_timelines.json"
    )
    # Upper bound in seconds for each HTTP request, so that a stalled
    # connection cannot hang the whole multi-season scrape indefinitely.
    REQUEST_TIMEOUT_SECONDS: ClassVar[float] = 30.0

    @property
    def season_str(self) -> str:
        """Returns a string representation of the season like '2024-25'."""
        return f"{self.season}-{str(self.season + 1)[-2:]}"

    @lru_cache(maxsize=500)  # noqa: B019
    def get_response_text(self) -> str:
        """
        Get the HTML content of the timeline page for this season and moment.

        Results are cached per instance value (the frozen dataclass is hashable),
        so the one second delay and the request only happen on a cache miss.

        Raises:
            requests.HTTPError: if the server responds with an error status.
            requests.Timeout: if the server does not respond within
                `REQUEST_TIMEOUT_SECONDS`.
        """
        url = f"{self.NESH_URL}/timeline/{self.moment}dates.php"
        # Pause before every uncached request to space out requests to the site.
        time.sleep(1)
        response = requests.get(
            url=url,
            params={"season": self.season_str},
            headers=get_request_headers(),
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        logging.info(
            "Request to %s returned status code %s.",
            response.url,
            response.status_code,
        )
        response.raise_for_status()
        text: str = response.text
        return text

    def extract_ski_area_dates(self) -> list[dict[str, Any]]:
        """
        Parse the timeline page into one record per ski area row.

        Each record holds the season, moment, state, ski area name, ski area
        page URL (None when the name is not a link), the raw date text, and the
        ISO 8601 date derived from it.

        Raises:
            ValueError: if a table has no rows, a row does not have exactly two
                cells, or a date cannot be parsed.
        """
        soup = BeautifulSoup(markup=self.get_response_text(), features="html.parser")
        # Each matching table holds the rows for one state.
        state_tables = soup.find_all("table", {"bgcolor": "#DCDCDC", "width": "100%"})
        records: list[dict[str, Any]] = []
        for table in state_tables:
            # The first row is the header naming the state, the rest are ski areas.
            header_row, *rows = table.find_all("tr")
            state_name = header_row.find("b").text
            for row in rows:
                ski_area_cell, date_cell = row.find_all("td")
                date_raw = date_cell.get_text(strip=True)
                ski_area_url = None
                if link_elem := ski_area_cell.find("a"):
                    ski_area_url = f"{self.NESH_URL}{link_elem.get('href')}"
                records.append(
                    {
                        "season": self.season,
                        "season_str": self.season_str,
                        "moment": self.moment,
                        "state": state_name,
                        "ski_area_name": ski_area_cell.get_text(strip=True),
                        "ski_area_page": ski_area_url,
                        "date_raw": date_raw,
                        "date_iso": self.parse_raw_date(date_raw),
                    }
                )
        return records

    def parse_raw_date(self, date_raw: str) -> str:
        """
        Parse the raw date string into an ISO 8601 date string.

        `date_raw` is expected to look like 'Nov 28': an English three-letter
        month abbreviation followed by the day of the month. The year is
        inferred from the season via `get_season_year_offset`.

        Raises:
            ValueError: if `date_raw` does not have that shape or does not
                describe a valid calendar date. The message names the raw
                value and the season to make the offending page easy to find.
        """
        try:
            month_abbr, day = date_raw.split()
            # NOTE: calendar.month_abbr follows the active LC_TIME locale;
            # index 0 is an empty string so the index is the month number.
            month_index = list(calendar.month_abbr).index(month_abbr)
            return date(
                year=self.season + self.get_season_year_offset(month_abbr),
                month=month_index,
                day=int(day),
            ).isoformat()
        except (KeyError, ValueError) as error:
            raise ValueError(
                f"Cannot parse date {date_raw!r} for the {self.season_str} season."
            ) from error

    @staticmethod
    def get_season_year_offset(month_abbr: str) -> int:
        """
        Offset from the starting year of the ski season for the Northern Hemisphere.

        September through December fall in the season's starting year (offset 0);
        January through August fall in the following year (offset 1).

        Raises:
            KeyError: if `month_abbr` is not a known month abbreviation.
        """
        return {
            "Jan": 1,
            "Feb": 1,
            "Mar": 1,
            "Apr": 1,
            "May": 1,
            "Jun": 1,
            "Jul": 1,
            "Aug": 1,
            "Sep": 0,
            "Oct": 0,
            "Nov": 0,
            "Dec": 0,
        }[month_abbr]

    @staticmethod
    def get_all_seasons(starting_year: int = 1936) -> list[int]:
        """Return every season starting year from `starting_year` through the current year."""
        current_year = date.today().year
        return list(range(starting_year, current_year + 1))

    @classmethod
    def scrape_all_seasons(cls) -> list[dict[str, Any]]:
        """
        Scrape opening and closing dates for all seasons and save them as JSON.

        Rows are sorted by ski area name and then ISO date, written to
        `JSON_PATH` as UTF-8, and also returned.
        """
        moments: tuple[Literal["opening", "closing"], ...] = ("opening", "closing")
        rows: list[dict[str, Any]] = []
        for season in cls.get_all_seasons():
            for moment in moments:
                scraper = cls(season=season, moment=moment)
                rows.extend(scraper.extract_ski_area_dates())
        rows.sort(key=lambda row: (row["ski_area_name"], row["date_iso"]))
        json_str = json.dumps(rows, indent=2, ensure_ascii=False)
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding must be pinned rather than left to the platform default.
        cls.JSON_PATH.write_text(json_str + "\n", encoding="utf-8")
        return rows
11 changes: 7 additions & 4 deletions openskistats/openskimap_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
import requests

from openskistats.models import OpenSkiMapStatus, RunCoordinateModel, SkiRunUsage
from openskistats.utils import get_data_directory, get_repo_directory, running_in_test
from openskistats.utils import (
get_data_directory,
get_repo_directory,
get_request_headers,
running_in_test,
)
from openskistats.variables import set_variables


Expand Down Expand Up @@ -68,9 +73,7 @@ def download_openskimap_geojson(
path = get_openskimap_path(name)
path.parent.mkdir(exist_ok=True)
logging.info(f"Downloading {url} to {path}")
headers = {
"From": "https://github.com/dhimmel/openskistats",
}
headers = get_request_headers()
response = requests.get(url, allow_redirects=True, headers=headers)
response.raise_for_status()
with lzma.open(path, "wb") as write_file:
Expand Down
6 changes: 6 additions & 0 deletions openskistats/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,9 @@ def oxford_join(
if len(head) == 1:
return f"{head[0]} {final_sep_extra}{final}"
return f"{sep.join(head)}{sep}{final_sep_extra}{final}"


def get_request_headers() -> dict[str, str]:
    """
    Build the HTTP headers to attach to outbound requests.

    A new dictionary is created on each call, so callers may modify the
    result without affecting one another.
    """
    headers: dict[str, str] = {}
    headers["From"] = "https://github.com/dhimmel/openskistats"
    return headers
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ license = {file = "LICENSE.md"}
repository = "https://github.com/dhimmel/openskistats"
requires-python = ">= 3.12"
dependencies = [
"beautifulsoup4>=4.12.3",
"folium>=0.19.3",
"great-tables >= 0.13.0", # https://github.com/machow/reactable-py/issues/29
"jupyter >= 1.1.1",
Expand Down
2 changes: 2 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit fe157d3

Please sign in to comment.