Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes issue #443; add 'available_dates' and 'fetch_data_for_date' comman... #451

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 75 additions & 7 deletions scraper/raw_inmate_data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@

import os.path
import csv
from os import path
from pyquery import PyQuery
from collections import OrderedDict
import shutil
from itertools import chain
import csv, shutil, requests

# Base URL of the published raw inmate data; a year (and then
# '/<YYYY-MM-DD>.csv') is appended to it to address a listing or a file.
RAW_INMATE_DATA_RELEASE_URL = 'http://cookcountyjail.recoveredfactory.net/raw_inmate_data/'
# First year that is queried when enumerating the available dates.
RAW_INMATE_DATA_STARTING_YEAR = '2014'

# NOTE(review): presumably the feature-control keys whose values name the
# build and release directories -- their use is not visible here, confirm.
RAW_INMATE_DATA_BUILD_DIR = 'CCJ_RAW_INMATE_DATA_BUILD_DIR'
RAW_INMATE_DATA_RELEASE_DIR = 'CCJ_RAW_INMATE_DATA_RELEASE_DIR'
Expand Down Expand Up @@ -30,6 +34,7 @@ class RawInmateData:
('Court_Location', 'court_house_location')
])


def __init__(self, snap_shot_date, feature_controls, monitor):
if feature_controls is None:
feature_controls = {}
Expand All @@ -45,7 +50,45 @@ def __init__(self, snap_shot_date, feature_controls, monitor):
self.__feature_activated = False
self.__configure_feature(feature_controls)


@staticmethod
def available_dates():
    """ List every date that has published csv data.

    Years are queried one after another, beginning with
    RAW_INMATE_DATA_STARTING_YEAR, and the walk ends at the first year
    that yields no dates. Each entry is text formatted as YYYY-MM-DD. """
    collected = []
    year = int(RAW_INMATE_DATA_STARTING_YEAR)
    while True:
        dates_for_year = RawInmateData._available_dates_for_year(str(year))
        if not dates_for_year:
            # None (lookup failed) and an empty listing both end the walk.
            break
        collected.extend(dates_for_year)
        year += 1
    return collected

@staticmethod
def _available_dates_for_year(year):
    """ Given a year, query the raw inmate data API, and return
    a list of dates for which there is csv data available there.

    The dates are in text format, as follows: YYYY-MM-DD. Returns None
    when the request fails, times out, or does not answer with an OK
    status. Links in the listing that do not name a '.csv' file are
    ignored. """
    try:
        # Without a timeout an unresponsive server would block forever;
        # requests' timeout errors derive from RequestException.
        response = requests.get(RAW_INMATE_DATA_RELEASE_URL + year, timeout=30)
    except requests.RequestException:
        return None

    if response.status_code != requests.codes.ok:
        return None

    listing = PyQuery(response.content)
    suffix = '.csv'
    dates = []
    # get the links from the directory page, ignoring the first link,
    # which points to the dir above
    for link in listing('a:not(:first-child)'):
        name = link.text_content()
        # only strip the extension from names that actually carry it
        if name.endswith(suffix):
            dates.append(name[:-len(suffix)])
    return dates

def add(self, inmate_details):
""" Add an inmate record to the account of raw csv data. """
if not self.__feature_activated:
return
if self.__build_file_writer is None:
Expand All @@ -70,36 +113,61 @@ def __debug(self, msg, debug_level=None):
self.__monitor.debug('{0}: {1}'.format(self.__klass_name, msg), debug_level)

def __ensure_year_dir(self):
    """ Make sure the directory for the snap shot's year exists inside
    the raw inmate directory, and return the path of that directory. """
    year_name = self.__snap_shot_date.strftime('%Y')
    target_dir = path.join(self.__raw_inmate_dir, year_name)
    try:
        os.makedirs(target_dir)
    except OSError:
        # makedirs also fails when the directory is already there; only
        # pass the error on when the directory really is missing.
        if path.isdir(target_dir):
            return target_dir
        raise
    return target_dir

def __feature_control(self, feature_controls, feature_control):
""" Look up feature_control in the feature_controls mapping and check
that its value names an existing directory.

Returns a tuple (okay, dir_name): dir_name is the value stored under
feature_control (None when the key is absent) and okay tells whether
that value is an existing directory (False when the key is absent). """
# defaults used when the key is not present in feature_controls
okay, dir_name = False, None
if feature_control in feature_controls:
dir_name = feature_controls[feature_control]
# the next two lines are the same directory check spelled two ways
# (os.path.isdir / path.isdir)
okay = os.path.isdir(dir_name)
okay = path.isdir(dir_name)
# NOTE(review): indentation is not preserved in this view, so it cannot be
# told here whether this message is also emitted when the key is absent
# (dir_name would then be None) -- confirm against the real file.
if not okay:
self.__debug("'%s' does not exist or is not a directory" % dir_name)
return okay, dir_name

@staticmethod
def fetch_data_for_date(date):
    """ Return the raw inmate data for the supplied date, in YYYY-MM-DD format.

    The content of the csv file published for that date is returned.
    If the data can't be fetched, for whatever reason, returns None:
    the date is not listed for its year, the request fails or times
    out, or the server does not answer with an OK status. """
    try:
        chosen_year = date[:4]
    except TypeError:
        # not a sliceable value, so it cannot be a YYYY-MM-DD string
        return None

    # Only the listing of the date's own year can contain it, so there
    # is no need to crawl every year before fetching a single file.
    dates_for_year = RawInmateData._available_dates_for_year(chosen_year)
    if not dates_for_year or date not in dates_for_year:
        return None

    query_url = RAW_INMATE_DATA_RELEASE_URL + chosen_year + '/' + date + '.csv'
    try:
        # Without a timeout an unresponsive server would block forever;
        # requests' timeout errors derive from RequestException.
        response = requests.get(query_url, timeout=30)
    except requests.RequestException:
        return None

    if response.status_code != requests.codes.ok:
        return None

    return response.content

def __file_name(self):
    """ Name of the csv file for the snap shot date: YYYY-MM-DD.csv """
    snap_shot = self.__snap_shot_date
    return snap_shot.strftime('%Y-%m-%d.csv')

def finish(self):
    """ Call this once the build file holding the raw inmate data is
    complete: the file is closed and moved into the directory for the
    snap shot's year. Does nothing when the feature is not activated. """
    if self.__feature_activated:
        self.__build_file.close()
        release_dir = self.__ensure_year_dir()
        shutil.move(self.__build_file_name, release_dir)

def __open_build_file(self):
    """ Create the build file for the snap shot date inside the build
    directory, attach a csv writer to it, and write the header row made
    of the HEADER_METHOD_NAMES keys. """
    self.__build_file_name = path.join(self.__build_dir, self.__file_name())
    self.__build_file = open(self.__build_file_name, "w")
    self.__build_file_writer = csv.writer(self.__build_file)
    # the column titles are the keys of the header mapping, in its order
    column_titles = list(RawInmateData.HEADER_METHOD_NAMES.iterkeys())
    self.__build_file_writer.writerow(column_titles)