diff --git a/scraper/raw_inmate_data.py b/scraper/raw_inmate_data.py index 44fe3a5..66082b7 100644 --- a/scraper/raw_inmate_data.py +++ b/scraper/raw_inmate_data.py @@ -1,8 +1,12 @@ -import os.path -import csv +from os import path +from pyquery import PyQuery from collections import OrderedDict -import shutil +from itertools import chain +import csv, shutil, requests + +RAW_INMATE_DATA_RELEASE_URL = 'http://cookcountyjail.recoveredfactory.net/raw_inmate_data/' +RAW_INMATE_DATA_STARTING_YEAR = '2014' RAW_INMATE_DATA_BUILD_DIR = 'CCJ_RAW_INMATE_DATA_BUILD_DIR' RAW_INMATE_DATA_RELEASE_DIR = 'CCJ_RAW_INMATE_DATA_RELEASE_DIR' @@ -30,6 +34,7 @@ class RawInmateData: ('Court_Location', 'court_house_location') ]) + def __init__(self, snap_shot_date, feature_controls, monitor): if feature_controls is None: feature_controls = {} @@ -45,7 +50,45 @@ def __init__(self, snap_shot_date, feature_controls, monitor): self.__feature_activated = False self.__configure_feature(feature_controls) + + @staticmethod + def available_dates(): + """ Return a list of dates for which there is csv data available. + The dates are in text format, as follows: YYYY-MM-DD. """ + year_to_try = RAW_INMATE_DATA_STARTING_YEAR + result = True + dates = [] + while result: + result = RawInmateData._available_dates_for_year(year_to_try) + year_to_try = str(int(year_to_try) + 1) + if result: + dates.extend(result) + + return dates + + @staticmethod + def _available_dates_for_year(year): + """ Given a year, query the raw inmate data API, and return + a list of dates for which there is csv data available there. + The dates are in text format, as follows: YYYY-MM-DD. If + there is no data for the year, returns None. """ + try: + result = requests.get(RAW_INMATE_DATA_RELEASE_URL + year) + except requests.RequestException: + return None + + if result.status_code != requests.codes.ok: + return None + + doc = PyQuery(result.content) + # get a list of links from the directory page + # ignore the first link, which points to the dir above + dates = doc('a:not(:first-child)') + # drop the '.csv' + return [d.text_content()[:-4] for d in dates] + def add(self, inmate_details): + """ Add an inmate record to the account of raw csv data. """ if not self.__feature_activated: return if self.__build_file_writer is None: @@ -70,11 +113,11 @@ def __debug(self, msg, debug_level=None): self.__monitor.debug('{0}: {1}'.format(self.__klass_name, msg), debug_level) def __ensure_year_dir(self): - year_dir = os.path.join(self.__raw_inmate_dir, self.__snap_shot_date.strftime('%Y')) + year_dir = path.join(self.__raw_inmate_dir, self.__snap_shot_date.strftime('%Y')) try: os.makedirs(year_dir) except OSError: - if not os.path.isdir(year_dir): + if not path.isdir(year_dir): raise return year_dir @@ -82,15 +125,36 @@ def __feature_control(self, feature_controls, feature_control): okay, dir_name = False, None if feature_control in feature_controls: dir_name = feature_controls[feature_control] - okay = os.path.isdir(dir_name) + okay = path.isdir(dir_name) if not okay: self.__debug("'%s' does not exist or is not a directory" % dir_name) return okay, dir_name + @staticmethod + def fetch_data_for_date(date): + """ Return the raw inmate data for the supplied date, in YYYY-MM-DD format. + If the data can't be fetched, for whatever reason, returns None. """ + if date not in RawInmateData.available_dates(): + return None + + chosen_year = date[:4] + query_url = RAW_INMATE_DATA_RELEASE_URL + chosen_year + '/' + date + '.csv' + try: + result = requests.get(query_url) + except requests.RequestException: + return None + + if result.status_code != requests.codes.ok: + return None + + return result.content + def __file_name(self): return self.__snap_shot_date.strftime('%Y-%m-%d.csv') def finish(self): + """ Method to be called when the build file containing the raw inmate data + is completed. Moves that file into the release directory. """ if not self.__feature_activated: return self.__build_file.close() @@ -98,8 +162,12 @@ def finish(self): shutil.move(self.__build_file_name, year_dir) def __open_build_file(self): - self.__build_file_name = os.path.join(self.__build_dir, self.__file_name()) + self.__build_file_name = path.join(self.__build_dir, self.__file_name()) self.__build_file = open(self.__build_file_name, "w") self.__build_file_writer = csv.writer(self.__build_file) header_names = [header_name for header_name in RawInmateData.HEADER_METHOD_NAMES.iterkeys()] self.__build_file_writer.writerow(header_names) + + + +