Merge pull request #61 from chihacknight/automate-schedule-downloads

Automate schedule downloads

Showing 4 changed files with 235 additions and 20 deletions.
@@ -0,0 +1,68 @@
name: Automate CTA schedule and realtime downloads

on:
  schedule:
    # Run every day at 17:30 UTC, i.e. 12:30pm CDT (11:30am CST)
    - cron: '30 17 * * *'

env:
  PYTHON_VERSION: 3.10.6
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

jobs:
  download-cta-schedule-data:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Download and save CTA schedule data
        run: |
          pip install -r requirements.txt
          python -c 'from scrape_data.cta_data_downloads import save_cta_zip; \
            save_cta_zip()' \
            $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY

  save-schedule-daily-summary:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Save schedule summaries
        run: |
          pip install -r requirements.txt
          python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \
            save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY

  save-realtime-daily-summary:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Save realtime summaries
        run: |
          pip install -r requirements.txt
          python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \
            save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY
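All three jobs follow the same pattern: check out the repo, set up Python, install requirements, then run one function from scrape_data/cta_data_downloads.py with the AWS credentials as positional arguments. A local equivalent of the first job's step might look like this (a sketch, not part of this commit; it assumes AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are already exported, mirroring the workflow's env block):

# Local equivalent of the "Download and save CTA schedule data" step (sketch).
import os
import subprocess

subprocess.run(
    [
        "python", "-c",
        "from scrape_data.cta_data_downloads import save_cta_zip; save_cta_zip()",
        os.environ["AWS_ACCESS_KEY_ID"],
        os.environ["AWS_SECRET_ACCESS_KEY"],
    ],
    check=True,  # surface a non-zero exit if the download or upload fails
)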
scrape_data/cta_data_downloads.py
@@ -0,0 +1,115 @@
import sys
from io import StringIO

import boto3
import pandas as pd
import pendulum

import data_analysis.compare_scheduled_and_rt as csrt
import data_analysis.static_gtfs_analysis as sga

# AWS credentials are passed on the command line by the GitHub Actions jobs
ACCESS_KEY = sys.argv[1]
SECRET_KEY = sys.argv[2]

client = boto3.client(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY
)

s3 = boto3.resource(
    's3',
    region_name='us-east-1',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY
)

today = pendulum.now('America/Chicago').to_date_string()

# Download the current CTA GTFS zipfile once at import time so that
# save_cta_zip and save_sched_daily_summary work from the same snapshot.
CTA_GTFS, zipfile_bytes_io = sga.download_cta_zip()

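# Aside (illustration, not part of this commit): the module uses both S3 entry
# points. The low-level client backs upload_fileobj() and the paginator in
# keys(); the resource API backs Object(...).put() in save_csv_to_bucket().
# For a hypothetical bucket and key the two writes are equivalent:
#
#     client.put_object(Bucket='example-bucket', Key='demo.txt', Body=b'hi')
#     s3.Object('example-bucket', 'demo.txt').put(Body=b'hi')
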
def save_cta_zip() -> None:
    print(f'Saving zipfile available at '
          f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip '
          f'on {today} to public bucket')
    filename = f'cta_schedule_zipfiles_raw/google_transit_{today}.zip'
    zipfile_bytes_io.seek(0)
    client.upload_fileobj(
        zipfile_bytes_io,
        csrt.BUCKET_PUBLIC,
        filename
    )
    print(f'Confirm that {filename} exists in bucket')
    keys(csrt.BUCKET_PUBLIC, [filename])

def save_csv_to_bucket(df: pd.DataFrame, filename: str) -> None:
    """Save a pandas DataFrame as a csv in s3

    Args:
        df (pd.DataFrame): DataFrame to be saved
        filename (str): Name of the saved filename in s3.
            Should contain the .csv suffix.
    """
    csv_buffer = StringIO()
    df.to_csv(csv_buffer)

    print(f'Saving {filename} to public bucket')
    s3.Object(
        csrt.BUCKET_PUBLIC,
        filename)\
        .put(Body=csv_buffer.getvalue())

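# Usage sketch for save_csv_to_bucket (hypothetical data, not part of this
# commit):
#
#     df = pd.DataFrame({'route_id': ['22', '36'], 'trip_count': [120, 98]})
#     save_csv_to_bucket(df, filename='examples/demo_summary.csv')
#
# Note that df.to_csv(csv_buffer) keeps the default index=True, so the saved
# csv carries the DataFrame index as its first column.
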
def save_sched_daily_summary() -> None:
    data = sga.GTFSFeed.extract_data(CTA_GTFS)
    data = sga.format_dates_hours(data)
    trip_summary = sga.make_trip_summary(data)

    route_daily_summary = sga.summarize_date_rt(trip_summary)
    route_daily_summary['date'] = route_daily_summary['date'].astype(str)
    route_daily_summary_today = route_daily_summary.loc[
        route_daily_summary['date'] == today
    ]

    print(f'Saving cta_route_daily_summary_{today}.csv to public bucket')
    filename = f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv'
    save_csv_to_bucket(
        route_daily_summary_today,
        filename=filename
    )
    print(f'Confirm that {filename} exists in bucket')
    keys(csrt.BUCKET_PUBLIC, [filename])

def save_realtime_daily_summary() -> None:
    # Summarize the most recent complete day of realtime data. Before 11am
    # Chicago time, yesterday's full-day file may not be available yet, so
    # fall back to the day before.
    if pendulum.now("America/Chicago").hour >= 11:
        end_date = pendulum.yesterday("America/Chicago")
    else:
        end_date = pendulum.now("America/Chicago").subtract(days=2)

    end_date = end_date.to_date_string()

    daily_data = pd.read_csv(
        (csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv").as_uri(),
        low_memory=False
    )

    daily_data = csrt.make_daily_summary(daily_data)
    filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{end_date}.csv'
    save_csv_to_bucket(daily_data, filename=filename)

    print(f'Confirm that {filename} exists in bucket')
    keys(csrt.BUCKET_PUBLIC, [filename])

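# Worked example of the cutoff above (illustration only): a run at 09:00
# America/Chicago on 2023-06-15 has hour < 11, so end_date resolves to
# 2023-06-13; a run at 12:00 that day resolves to 2023-06-14. The scheduled
# workflow fires at 17:30 UTC (11:30am/12:30pm in Chicago), so it normally
# takes the "yesterday" branch.
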
# Adapted from
# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3
def keys(bucket_name: str, filenames: list,
         prefix: str = '/', delimiter: str = '/',
         start_after: str = '') -> None:
    """Print a confirmation line for each name in filenames found in the bucket."""
    s3_paginator = client.get_paginator('list_objects_v2')
    prefix = prefix.lstrip(delimiter)
    start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after
    for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after):
        for content in page.get('Contents', ()):
            if content['Key'] in filenames:
                print(f"{content['Key']} exists")