Skip to content

Commit

Permalink
Management command for exporting CSV
Browse files Browse the repository at this point in the history
Create a class, ``AddressAnonymizer``, for anonymizing the addresses.

This leverages datamade's usaddress package, but we have to use
a fork that I created that adds in really basic support for Python 3.

Create a management command ``export_csv`` to output CSV of the
anonymized disposition fields to standard output.

Addresses sc3/cook-convictions#110
  • Loading branch information
ghing committed Oct 27, 2014
1 parent aeb670a commit fbea4af
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 1 deletion.
40 changes: 40 additions & 0 deletions convictions_data/address.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import usaddress

class AddressAnonymizer(object):
    """Anonymize street addresses to the 100 block.

    An address is parsed into ``(component, component_type)`` pairs with
    ``usaddress.parse``.  The ``AddressNumber`` component has its last two
    digits replaced by zeros, and components whose type is listed in
    ``skip_component_types`` are dropped.  Results are memoized per
    instance so a repeated address is only parsed once.
    """

    # Don't include address components of these types in the anonymized
    # output.
    skip_component_types = [
        'AddressNumberSuffix',
        'OccupancyIdentifier',
        'OccupancyType',
    ]

    def __init__(self):
        # Maps a raw address string to its anonymized form.
        self._cache = {}

    def anonymize(self, address):
        """Return ``address`` with its house number reduced to the 100 block.

        Empty or missing addresses (``''`` or ``None``) are returned
        unchanged instead of being handed to the parser.
        """
        if not address:
            return address

        try:
            return self._cache[address]
        except KeyError:
            parsed = usaddress.parse(address)
            anonymized = ' '.join(self._anonymize_parsed(parsed))
            self._cache[address] = anonymized
            return anonymized

    def _anonymize_parsed(self, components):
        """Return the list of component strings to keep, in order.

        ``components`` is an iterable of ``(component, component_type)``
        pairs.
        """
        anonymized = []
        for c, c_type in components:
            if c_type in self.skip_component_types:
                continue
            if c_type == 'AddressNumber':
                c = self._anonymize_address_number(c)
            anonymized.append(c)

        return anonymized

    def _anonymize_address_number(self, n):
        """Return the house number string ``n`` with its last two digits zeroed.

        Only the leading run of ASCII digits is considered, so a trailing
        letter or range ("7719A", "7719-21") can neither shift the rounding
        nor leak into the output.  Numbers of one or two digits become
        ``'0'``.  A token with no leading digits is sliced as-is.
        """
        digits = n[:len(n) - len(n.lstrip('0123456789'))]
        if digits:
            n = digits

        if len(n) <= 2:
            return '0'
        return n[:-2] + '00'
15 changes: 15 additions & 0 deletions convictions_data/management/commands/export_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import csv

from django.core.management.base import BaseCommand

from convictions_data.models import Disposition

class Command(BaseCommand):
    """Management command that writes anonymized dispositions as CSV.

    Rows come from ``Disposition.objects.anonymized_values()`` and are
    written to the command's standard output; no header row is emitted.
    """

    help = "Export disposition records to CSV removing personal information."

    def handle(self, *args, **options):
        """Write one CSV row per anonymized disposition record."""
        dispositions = Disposition.objects
        csv_writer = csv.DictWriter(
            self.stdout,
            fieldnames=dispositions.EXPORT_FIELDS,
        )

        for record in dispositions.anonymized_values():
            csv_writer.writerow(record)
7 changes: 7 additions & 0 deletions convictions_data/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ def has_geocodable_address(self):
def in_analysis(self):
    """Delegate to ``in_analysis()`` on this manager's query set."""
    queryset = self.get_query_set()
    return queryset.in_analysis()

def anonymized_values(self):
    """Delegate to ``anonymized_values()`` on this manager's query set."""
    queryset = self.get_query_set()
    return queryset.anonymized_values()

@property
def EXPORT_FIELDS(self):
    """Expose the query set's ``EXPORT_FIELDS`` on the manager."""
    queryset = self.get_query_set()
    return queryset.EXPORT_FIELDS


class ConvictionGeoManager(geo_models.GeoManager):
def get_queryset(self):
Expand Down
1 change: 1 addition & 0 deletions convictions_data/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ class Disposition(models.Model):
# Use a custom manager to add geocoding methods
objects = DispositionManager()


def __init__(self, *args, **kwargs):
super(Disposition, self).__init__(*args, **kwargs)
if self.pk is None:
Expand Down
48 changes: 48 additions & 0 deletions convictions_data/query/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from djgeojson.serializers import Serializer as GeoJSONSerializer

from convictions_data.address import AddressAnonymizer
from convictions_data.geocoders import BatchOpenMapQuest
from convictions_data.signals import (pre_geocode_page, post_geocode_page)

Expand All @@ -33,6 +34,46 @@
class DispositionQuerySet(SexQuerySetMixin, AgeQuerySetMixin, DrugQuerySetMixin, QuerySet):
"""Custom QuerySet that adds bulk geocoding capabilities"""

# Names of the fields that make up one exported row.  ``anonymized_values()``
# passes this list to ``values()``, and the ``export_csv`` management command
# uses it as the ``fieldnames`` of its ``csv.DictWriter``, so the order here
# is the column order of the export.
EXPORT_FIELDS = [
    'case_number',
    'sequence_number',
    'st_address',
    'city',
    'state',
    'zipcode',
    'arrest_date',
    'initial_date',
    'sex',
    'statute',
    'chrgdesc',
    'chrgtype',
    'chrgtype2',
    'chrgclass',
    'chrgdisp',
    'chrgdispdate',
    'ammndchargstatute',
    'ammndchrgdescr',
    'ammndchrgtype',
    'ammndchrgclass',
    'minsent_years',
    'minsent_months',
    'minsent_days',
    'minsent_life',
    'minsent_death',
    'maxsent_years',
    'maxsent_months',
    'maxsent_days',
    'maxsent_life',
    'maxsent_death',
    'amtoffine',
]
"""
Fields to include in a CSV export of these records
In particular, we exclude personally identifying information like
``ctlbkngno``, ``fgrprntno`` and ``dob``.
"""

def geocode(self, batch_size=100, timeout=1):
geocoder = BatchOpenMapQuest(
api_key=settings.CONVICTIONS_GEOCODER_API_KEY,
Expand Down Expand Up @@ -264,6 +305,13 @@ def create_convictions(self):

return convictions

def anonymized_values(self):
    """Yield one dict per record, limited to ``EXPORT_FIELDS``.

    The ``st_address`` value of each dict is replaced by its anonymized
    form before the dict is yielded.  A single ``AddressAnonymizer`` is
    shared across all records.
    """
    address_anonymizer = AddressAnonymizer()
    for record in self.values(*self.EXPORT_FIELDS):
        record['st_address'] = address_anonymizer.anonymize(
            record['st_address'])
        yield record


class ConvictionQuerySet(SexQuerySetMixin, AgeQuerySetMixin, DrugQuerySetMixin, QuerySet):
"""
Expand Down
30 changes: 29 additions & 1 deletion convictions_data/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
from django.conf import settings
from django.test import SimpleTestCase, TestCase, TransactionTestCase

from convictions_data import statute
from convictions_data.address import AddressAnonymizer
from convictions_data.cleaner import CityStateCleaner, CityStateSplitter
from convictions_data.geocoders import BatchOpenMapQuest
from convictions_data.models import Disposition, RawDisposition
from convictions_data import statute

try:
from django.test.runner import DiscoverRunner as BaseRunner
Expand Down Expand Up @@ -287,3 +288,30 @@ def test_get_iucr(self):
for s, iucr_code in test_values:
iucr_offense = statute.get_iucr(s)[0]
self.assertEqual(iucr_offense.code, iucr_code)


class AddressAnonymizerTestCase(SimpleTestCase):
    """Tests for ``AddressAnonymizer``."""

    def setUp(self):
        self.anonymizer = AddressAnonymizer()

    def test_anonymize(self):
        # These address formats are from the data, but the street names
        # have been changed to made-up ones.
        test_data = [
            ('7719 1/2 N LINDA', '7700 N LINDA'),
            ('1505 S KERNEY 1ST FL', '1500 S KERNEY'),
            ('523 W 999TH ST 2ND FL', '500 W 999TH ST'),
            ('5920 N HILL #220', '5900 N HILL'),
        ]

        for address, expected in test_data:
            anonymized = self.anonymizer.anonymize(address)
            self.assertEqual(anonymized, expected)

    def test_anonymize_address_number(self):
        test_data = [
            ('7719', '7700'),
            ('523', '500'),
            ('100', '100'),
            ('50', '0'),
        ]

        # Without this loop the test only built its fixture and passed
        # without checking anything.
        for number, expected in test_data:
            anonymized = self.anonymizer._anonymize_address_number(number)
            self.assertEqual(anonymized, expected)
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ geopy>=0.99
South>=0.8,<1.0
django-geojson==2.6.0
django-model-utils==2.2
# Use my fork of usaddress for Python 3 support
# This is an expedient hack and I'm sorry
-e git+https://github.com/ghing/usaddress.git@python3_support#egg=usaddress

0 comments on commit fbea4af

Please sign in to comment.