Skip to content

Commit

Permalink
Clean up city/state splitting and cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
ghing committed Jun 26, 2014
1 parent 2af92d7 commit f57b45b
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 70 deletions.
1 change: 1 addition & 0 deletions convictions_data/fixtures/test_municipalities.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions convictions_data/geocoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def batch_geocode(self, queries, exactly_one=True, timeout=None):
self.scheme)
# The key is already urlencoded, so just append it at the end
url = "&".join((url, urlencode(params), "key={}".format(self.api_key)))
print(url)
data = self._call_geocoder(url, timeout=timeout)
return self._batch_parse_json(data['results'], exactly_one)

Expand Down
101 changes: 53 additions & 48 deletions convictions_data/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,17 @@
"CNTRY": "COUNTRY",
"HL": "HILLS",
"HGTS": "HEIGHTS",
"HTS": "HEIGHTS",
"PK": "PARK",
"VILL": "VILLAGE",
"CTY": "CITY",
}

# Pattern for a five-digit ZIP code
ZIPCODE_RE = re.compile(r'^\d{5}$')

# Tokens that show up in the state position of the raw data but are not
# official state abbreviations
MOCK_STATES = {'ILL', 'I', 'MX'}

class ConvictionsQuerySet(models.query.QuerySet):
"""Custom QuerySet that adds bulk geocoding capabilities"""

Expand Down Expand Up @@ -145,7 +152,7 @@ class Conviction(models.Model):
objects = ConvictionManager()

# Runs of commas and periods; these are replaced with a space before a
# combined city/state string is split into tokens.
PUNCTUATION_RE = re.compile(r'[,.]+')
# Loose match for "CHICAGO" and its abbreviations: "CH", then optional "I",
# "C" and "A", then "G" and an optional trailing "O" (e.g. "CHGO", "CHG").
CHICAGO_RE = re.compile(r'^CH[I]{0,1}C{0,1}A{0,1}GO{0,1}$')

def __init__(self, *args, **kwargs):
super(Conviction, self).__init__(*args, **kwargs)
Expand Down Expand Up @@ -204,40 +211,43 @@ def load_from_raw(self):
self.city, self.state = self._parse_city_state(self.raw_conviction.city_state)

return self

@classmethod
def _parse_city_state(cls, city_state):
    """Turn a raw combined city/state value into a cleaned (city, state).

    The value is first split by ``_split_city_state`` and the two pieces
    are then normalized by ``_clean_city_state``, whose result is returned.
    """
    raw_city, raw_state = cls._split_city_state(city_state)
    cleaned = cls._clean_city_state(raw_city, raw_state)
    return cleaned

@classmethod
def _split_city_state(cls, city_state):
    """Split a raw combined city/state string into ``(city, state)``.

    Commas and periods are treated as whitespace. The state is taken from
    the final whitespace-separated token when ``us.states.lookup``
    recognizes it or it is listed in ``MOCK_STATES``. Failing that, it is
    taken from the last two characters of the final token when those are
    recognized (e.g. "CALUMET CITYIL"), and the rest of that token stays
    with the city. If neither applies, the state is ``""`` and every token
    is kept as part of the city.

    Nothing is cleaned or expanded here: abbreviations in the city and
    non-standard states such as "ILL" are returned as found.
    """
    city_state = cls.PUNCTUATION_RE.sub(' ', city_state)
    bits = city_state.split()
    if not bits:
        # Only whitespace/punctuation was supplied; there is nothing to
        # split and no token to hand to the state lookup.
        return "", ""

    last = bits[-1]
    suffix = last[-2:]

    if us.states.lookup(last) or last in MOCK_STATES:
        state = last
        city_bits = bits[:-1]
    elif len(last) >= 2 and (us.states.lookup(suffix) or
                             suffix in MOCK_STATES):
        # The state is glued onto the end of the last city word.
        # NOTE(review): any final word that merely ends in a recognized
        # two-letter code (e.g. "ELGIN" ends in "IN") would presumably be
        # split here as well -- confirm this is acceptable for the data.
        state = suffix
        city_bits = bits[:-1] + [last[:-2]]
    else:
        state = ""
        city_bits = bits

    return " ".join(city_bits), state

@classmethod
def _clean_city_state(cls, city, state):
    """Normalize an already-split city and state.

    Each space-separated word of ``city`` is passed through
    ``_unabbreviate_city_bit`` and then ``_fix_chicago``, and the words are
    re-joined with single spaces. A state of "ILL" becomes "IL"; any other
    state value is returned unchanged.

    Returns a ``(clean_city, clean_state)`` tuple.
    """
    clean_city = ' '.join(
        cls._fix_chicago(cls._unabbreviate_city_bit(word))
        for word in city.split(' ')
    )
    clean_state = "IL" if state == "ILL" else state

    return clean_city, clean_state

@classmethod
def _unabbreviate_city_bit(cls, s):
Expand All @@ -246,6 +256,13 @@ def _unabbreviate_city_bit(cls, s):
except KeyError:
return s

@classmethod
def _fix_chicago(cls, s):
    """Return "CHICAGO" if ``s`` matches ``CHICAGO_RE``, else ``s`` itself."""
    return "CHICAGO" if cls.CHICAGO_RE.match(s) else s

@classmethod
def _parse_zipcode(cls, zipcode):
zipcode = zipcode.strip()
Expand All @@ -256,28 +273,16 @@ def _parse_zipcode(cls, zipcode):
return zipcode

@classmethod
def _detect_state(cls, city):
    """Guess the state for a city that was recorded without one.

    Returns "IL" when ``city`` case-insensitively equals the
    ``municipality_name`` or ``agency_name`` of at least one
    ``Municipality`` record, and ``""`` otherwise.
    """
    # Check and see if the city name matches the name of a municipality
    # in Cook County. If it does, the state is IL.
    q = Q(municipality_name__iexact=city) | Q(agency_name__iexact=city)

    # Only the presence of a match matters, so ask the database whether
    # one exists instead of counting every matching row.
    if Municipality.objects.filter(q).exists():
        return "IL"

    return ""

@classmethod
def _parse_dob(cls, dob):
Expand Down
73 changes: 51 additions & 22 deletions convictions_data/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,37 +7,54 @@
from convictions_data.models import Conviction, RawConviction

class ConvictionModelTestCase(TestCase):
def test_split_city_state(self):
    """Splitting separates city from state without cleaning either part.

    Abbreviated city words and the non-standard state "ILL" are expected
    to come back exactly as they appeared in the input.
    """
    test_values = [
        ("EVANSTON ILL.", "EVANSTON", "ILL"),
        ("CHICAGO,ILL.", "CHICAGO", "ILL"),
        ("PALOS HILLS", "PALOS HILLS", ""),
        ("CALUMET CITYIL", "CALUMET CITY", "IL"),
        ("CNTRY CLB HL IL", "CNTRY CLB HL", "IL"),
        ("CHGO HGTS IL", "CHGO HGTS", "IL"),
        ("MELROSE PK", "MELROSE PK", ""),
        ("EAST CHICAGOIN", "EAST CHICAGO", "IN"),
        ("BLOOMINGDALEIN", "BLOOMINGDALE", "IN"),
        ("MICHIGAN CTYIN", "MICHIGAN CTY", "IN"),
    ]
    for city_state, expected_city, expected_state in test_values:
        city, state = Conviction._split_city_state(city_state)
        self.assertEqual(city, expected_city)
        self.assertEqual(state, expected_state)

def test_clean_city_state(self):
    """Cleaning expands city abbreviations and maps "ILL" to "IL"."""
    test_values = [
        ("EVANSTON", "ILL", "EVANSTON", "IL"),
        ("CNTRY CLB HL", "IL", "COUNTRY CLUB HILLS", "IL"),
        ("CHGO HGTS", "IL", "CHICAGO HEIGHTS", "IL"),
        ("MELROSE PK", "", "MELROSE PARK", ""),
        ("MICHIGAN CTY", "IN", "MICHIGAN CITY", "IN"),
    ]
    for city, state, expected_city, expected_state in test_values:
        clean_city, clean_state = Conviction._clean_city_state(city, state)
        self.assertEqual(clean_city, expected_city)
        self.assertEqual(clean_state, expected_state)

def test_parse_city_state(self):
    """Parsing both splits and cleans a raw combined city/state value."""
    test_values = [
        ("EVANSTON ILL.", "EVANSTON", "IL"),
        ("CHICAGO,ILL.", "CHICAGO", "IL"),
        ("PALOS HILLS", "PALOS HILLS", ""),
        ("CALUMET CITYIL", "CALUMET CITY", "IL"),
        ("CNTRY CLB HL IL", "COUNTRY CLUB HILLS", "IL"),
        ("CHGO HGTS IL", "CHICAGO HEIGHTS", "IL"),
        ("MELROSE PK", "MELROSE PARK", ""),
        ("EAST CHICAGOIN", "EAST CHICAGO", "IN"),
        ("BLOOMINGDALEIN", "BLOOMINGDALE", "IN"),
        ("MICHIGAN CTYIN", "MICHIGAN CITY", "IN"),
    ]
    for city_state, expected_city, expected_state in test_values:
        city, state = Conviction._parse_city_state(city_state)
        self.assertEqual(city, expected_city)
        self.assertEqual(state, expected_state)

def test_parse_date(self):
test_values = [
Expand All @@ -55,15 +72,15 @@ def test_load_from_raw(self):
sequence_number="1",
st_address="707 W WAVELAND",
city_state="CHGO ILL",
zipcode="XXXXX",
zipcode="60622",
dob="19-Nov-43",
arrest_date="2-Jun-89"
)
conviction = Conviction(raw_conviction=raw)
conviction.load_from_raw()
self.assertEqual(conviction.case_number, raw.case_number)
self.assertEqual(conviction.sequence_number, raw.sequence_number)
self.assertEqual(conviction.st_address, raw.st_address)
self.assertEqual(conviction.address, raw.st_address)
self.assertEqual(conviction.city, "CHICAGO")
self.assertEqual(conviction.state, "IL")
self.assertEqual(conviction.zipcode, raw.zipcode)
Expand All @@ -79,21 +96,33 @@ def test_auto_load_from_raw(self):
sequence_number="1",
st_address="707 W WAVELAND",
city_state="CHGO ILL",
zipcode="XXXXX",
zipcode="60622",
dob="19-Nov-43",
arrest_date="2-Jun-89"
)
conviction = Conviction(raw_conviction=raw)
self.assertEqual(conviction.case_number, raw.case_number)
self.assertEqual(conviction.sequence_number, raw.sequence_number)
self.assertEqual(conviction.st_address, raw.st_address)
self.assertEqual(conviction.address, raw.st_address)
self.assertEqual(conviction.city, "CHICAGO")
self.assertEqual(conviction.state, "IL")
self.assertEqual(conviction.zipcode, raw.zipcode)
self.assertEqual(conviction.dob, datetime.date(1943, 11, 19))
self.assertEqual(conviction.arrest_date, datetime.date(1989, 6, 2))


class ConvictionsModelWithMunicipalitiesTestCase(TestCase):
    """Conviction tests that run with the municipalities fixture loaded."""

    fixtures = ['test_municipalities.json']

    def test_detect_state(self):
        """``_detect_state`` reports "IL" for PALOS HILLS.

        The city is presumably covered by the fixture data; the fixture
        contents are not shown alongside this test.
        """
        cases = (
            ("PALOS HILLS", "IL"),
        )
        for city_name, wanted_state in cases:
            detected = Conviction._detect_state(city_name)
            self.assertEqual(detected, wanted_state)


class BatchOpenMapQuestTestCase(TestCase):
def setUp(self):
self.geocoder = BatchOpenMapQuest(
Expand Down

0 comments on commit f57b45b

Please sign in to comment.