Update loading of final statute and IUCR fields

Add final_statute_formatted field to the Disposition and Conviction models. Add a format_statute() function to format the parsed statute that will go into final_statute_formatted. Factor final_statute field population out into a method, load_final_statute_and_iucr. Remove trailing whitespace from files edited in this commit. Rename statute2iucr management command to load_final_statute_and_iucr to reflect these changes. Fix statute.strip_surrounding_parens to handle empty string. Addresses sc3#7, sc3/cook-convictions#83
ghing · Nov 27, 2014 · fb89e3c · fb89e3c
1 parent f50182a
commit fb89e3c
Show file tree

Hide file tree

Showing 7 changed files with 396 additions and 124 deletions.
diff --git a/convictions_data/management/commands/load_final_statute_and_iucr.py b/convictions_data/management/commands/load_final_statute_and_iucr.py
@@ -0,0 +1,24 @@
+import logging
+
+from django.core.management.base import BaseCommand
+from django.db import transaction
+
+from convictions_data.models import Disposition
+from convictions_data.statute import (get_iucr, IUCRLookupError,
+    ILCSLookupError, StatuteFormatError)
+
+logger = logging.getLogger(__name__)
+
+class Command(BaseCommand):
+    """Management command that reloads the derived statute fields."""
+    help = ("Load the final statute, nicely formatted statute and IUCR code "
+            "from the statute or ammended statute fields")
+
+    def handle(self, *args, **options):
+        """Reload final statute and IUCR fields for every Disposition.
+
+        The amended charge statute takes precedence over the original
+        statute. Records with neither value are neither updated nor saved.
+        All saves happen inside a single transaction.
+        """
+        with transaction.atomic():
+            for disposition in Disposition.objects.all():
+                # Prefer the amended statute, falling back to the original
+                raw_statute = (disposition.ammndchargstatute or
+                               disposition.statute)
+                if not raw_statute:
+                    continue
+
+                disposition.load_final_statute_and_iucr(raw_statute)
+                disposition.save()
diff --git a/convictions_data/management/commands/statute2iucr.py b/convictions_data/management/commands/statute2iucr.py
diff --git a/...migrations/0023_auto__add_field_conviction_final_statute_formatted__add_field_disposit.py b/...migrations/0023_auto__add_field_conviction_final_statute_formatted__add_field_disposit.py
diff --git a/convictions_data/models.py b/convictions_data/models.py
@@ -18,8 +18,9 @@
     CensusTractManager, CommunityAreaManager, DispositionManager)
 
 from convictions_data.query import ConvictionQuerySet
-from convictions_data.statute import (get_iucr, ILCSLookupError,
-        IUCRLookupError, StatuteFormatError)
+from convictions_data.statute import (get_iucr, parse_statute, format_statute,
+    MultipleMatchingILCSError, ILCSLookupError, IUCRLookupError,
+    StatuteFormatError)
 from convictions_data.signals import post_load_spatial_data
 
 
@@ -171,9 +172,15 @@ class Disposition(models.Model):
     amtoffine = models.IntegerField(null=True)
 
     final_statute = models.CharField(max_length=50, default="",
-        help_text="Field to make querying easier.  Set to the value of "
-        "ammndchargstatute if present, otherwise set to the value of statute",
+        help_text=("Field to make querying easier.  Set to the value of "
+                   "ammndchargstatute if present, otherwise set to the value "
+                   "of statute"),
         db_index=True)
+    final_statute_formatted = models.CharField(max_length=50, default="",
+        db_index=True,
+        help_text=("Value from final_statute but parsed and reformatted "
+                   "to try to normalize the formats and make grouping "
+                   "queries easier"))
     final_chrgdesc = models.CharField(max_length=50, default="", db_index=True)
     final_chrgtype = models.CharField(max_length=1, choices=CHRGTYPE_CHOICES,
         default="", db_index=True)
@@ -282,23 +289,7 @@ def _load_field_statute(self, val):
         self.statute = val
 
         if val:
-            self.final_statute = val
-
-            try:
-                offenses = get_iucr(val)
-                if len(offenses) == 1:
-                    self.iucr_code = offenses[0].code
-                    self.iucr_category = offenses[0].offense_category
-                else:
-                    logger.warn("Multiple matching IUCR offenses found for statute '{}'".format(val))
-            except IUCRLookupError as e:
-                logger.warn(e)
-            except ILCSLookupError as e:
-                logger.warn(e)
-            except AssertionError as e:
-                logger.warn(e)
-            except StatuteFormatError as e:
-                logger.warn(e)
+            self.load_final_statute_and_iucr(val)
 
         return self
 
@@ -333,23 +324,36 @@ def _load_field_ammndchargstatute(self, val):
         self.ammndchargstatute = val
 
         if val:
-            self.final_statute = val
+            self.load_final_statute_and_iucr(val)
 
-            try:
-                offenses = get_iucr(val)
-                if len(offenses) == 1:
-                    self.iucr_code = offenses[0].code
-                    self.iucr_category = offenses[0].offense_category
-                else:
-                    logger.warn("Multiple matching IUCR offenses found for statute '{}'".format(val))
-            except IUCRLookupError as e:
-                logger.warn(e)
-            except ILCSLookupError as e:
-                logger.warn(e)
-            except AssertionError as e:
-                logger.warn(e)
-            except StatuteFormatError as e:
-                logger.warn(e)
+        return self
+
+    def load_final_statute_and_iucr(self, val):
+        """Populate the final_statute, final_statute_formatted, iucr_code and
+        iucr_category fields from the value.
+
+        final_statute is always set to the raw value.  If the value cannot
+        be parsed, a warning is logged and the other fields are left as they
+        were.  The IUCR fields are only set when exactly one matching IUCR
+        offense is found; otherwise a warning is logged.
+
+        Arguments:
+            val (str): Raw statute string.
+
+        Returns:
+            This model instance, to allow chaining.
+        """
+        # NOTE(review): on a parse or lookup failure the formatted statute
+        # and IUCR fields keep whatever an earlier load set -- confirm this
+        # is intended when an amended statute replaces the original one.
+        self.final_statute = val
+
+        try:
+            parsed_statute = parse_statute(val)
+        except (StatuteFormatError, ILCSLookupError, MultipleMatchingILCSError) as e:
+            # If we weren't able to parse the statute, return early
+            logger.warning(e)
+            return self
+
+        # We've parsed the statute. Format it, and save this value.
+        self.final_statute_formatted = format_statute(parsed_statute)
+
+        try:
+            offenses = get_iucr(parsed_statute)
+        except IUCRLookupError:
+            # HACK: The original error will have a nicely-formatted statute.
+            # Replace it with the raw statute value
+            logger.warning(IUCRLookupError(val))
+        else:
+            if len(offenses) == 1:
+                self.iucr_code = offenses[0].code
+                self.iucr_category = offenses[0].offense_category
+            else:
+                logger.warning("Multiple matching IUCR offenses found for statute '{}'".format(val))
 
         return self
 
@@ -554,9 +558,15 @@ class Conviction(models.Model):
 
     chrgdispdate = models.DateField(null=True)
     final_statute = models.CharField(max_length=50, default="",
-        help_text="Field to make querying easier.  Set to the value of "
-        "ammndchargstatute if present, otherwise set to the value of statute",
+        help_text=("Field to make querying easier.  Set to the value of "
+                   "ammndchargstatute if present, otherwise set to the value "
+                   "of statute"),
         db_index=True)
+    final_statute_formatted = models.CharField(max_length=50, default="",
+        db_index=True,
+        help_text=("Value from final_statute but parsed and reformatted "
+                   "to try to normalize the formats and make grouping "
+                   "queries easier"))
     final_chrgdesc = models.CharField(max_length=50, default="", db_index=True)
     final_chrgtype = models.CharField(max_length=1, choices=CHRGTYPE_CHOICES,
         default="", db_index=True)
@@ -844,7 +854,7 @@ class County(geo_models.Model):
     intptlat10 = geo_models.CharField(max_length=11)
     intptlon10 = geo_models.CharField(max_length=12)
     geom = geo_models.MultiPolygonField()
-                                                                           
+
     objects = geo_models.GeoManager()
 
     FIELD_MAPPING = {

diff --git a/convictions_data/query/__init__.py b/convictions_data/query/__init__.py
@@ -187,6 +187,7 @@ def from_initial_chrgdispdate(self):
         'fbiidno',
         'fgrprntno',
         'final_statute',
+        'final_statute_formatted',
         'final_chrgdesc',
         'final_chrgtype',
         'final_chrgclass',

diff --git a/convictions_data/statute.py b/convictions_data/statute.py
@@ -29,7 +29,7 @@
 ]]
 ilcs_chapters_str = '|'.join(ilcs_chapters)
 ilcs_statute_re = re.compile(r"""(?P<chapter>{chapters})
-    [- ] # Delimiter between chapter and act prefix 
+    [- ] # Delimiter between chapter and act prefix
     (?P<act_prefix>\d+)
     [/\\] # Delimiter between act prefix and section
     (?P<section>[\da-zA-Z.]+(-[\da-zA-Z.]+){{0,1}})
@@ -41,7 +41,7 @@
     '38',
     '42',
     '56.5',
-    '95.5',      
+    '95.5',
     '121.5',
     '124',
     '134',
@@ -63,7 +63,7 @@ def __init__(self, chapter, paragraph, raw_statute=None):
 
     def __str__(self):
         msg = "Unable to find ILCS statute for raw statute '{}'".format(self.raw_statute)
-        
+
         return msg
 
 class StatuteFormatError(Exception):
@@ -74,6 +74,15 @@ def __init__(self, raw_statute):
     def __str__(self):
         return "Can't understand statute '{}'".format(self.raw_statute)
 
+class MultipleMatchingILCSError(Exception):
+    """Exception raised when a raw statute matches more than one ILCS section.
+
+    The raw statute string is kept on ``raw_statute`` and is included in the
+    error message.
+    """
+    def __init__(self, raw_statute):
+        self.raw_statute = raw_statute
+
+    def __str__(self):
+        template = "More than one matching ILCS sections for raw statute '{}'"
+        return template.format(self.raw_statute)
+
 class IUCRLookupError(Exception):
     """Exception raised when a matching IUCR offense for an ILCS section cannot
     be found"""
@@ -102,7 +111,7 @@ def parse_statute(s):
     except KeyError:
         # No match
 
-        # Try stripping trailing bits from paragraph 
+        # Try stripping trailing bits from paragraph
         m = ilrs_paragraph_re.match(paragraph)
         if not m:
             raise ILCSLookupError(chapter, paragraph, s)
@@ -113,8 +122,9 @@ def parse_statute(s):
         except KeyError:
             raise ILCSLookupError(chapter, paragraph, s)
 
-    assert len(ilcs_sections) == 1, ("More than one matching ILCS sections "
-        "for raw statute '{}'".format(s))
+    if len(ilcs_sections) != 1:
+        raise MultipleMatchingILCSError(s)
+
     ilcs_section = ilcs_sections[0]
     ilcs_parsed = [
         (ilcs_section.chapter, 'chapter'),
@@ -128,9 +138,9 @@ def parse_statute(s):
 def parse_subsection(s):
     """
     Parse the subsection portion of a statute citation
-    
+
     Arguments:
-        s (str): String containing the subsection portion of statute citation 
+        s (str): String containing the subsection portion of statute citation
 
     Returns:
         List of strings representing the subsection bits
@@ -139,7 +149,7 @@ def parse_subsection(s):
     ['c', '2']
     """
     subsections = []
-    bits = re.split(r'[-(\s]', s) 
+    bits = re.split(r'[-(\s]', s)
     for bit in bits:
         if bit:
             subsections.append(re.sub(r'[)]$', '', bit))
@@ -149,7 +159,7 @@ def parse_ilcs_statute(s):
     statute_parts = []
     m = ilcs_statute_re.match(s)
     if not m:
-        return statute_parts 
+        return statute_parts
 
     statute_parts.append((m.group('chapter'), 'chapter'))
     statute_parts.append((m.group('act_prefix'), 'act_prefix'))
@@ -288,25 +298,42 @@ def fix_ambiguous_statute(s):
         except KeyError:
             return s
 
-def get_iucr(s):
+def format_statute(parsed_statute):
+    """
+    Render a parsed statute as a normalized citation string.
+
+    Arguments:
+        parsed_statute: Sequence of (value, label) pairs ordered as chapter,
+            act prefix, section and then zero or more subsections.  Only the
+            first item of each pair is used.
+
+    Returns:
+        String of the form "chapter-act_prefix/section" with each subsection
+        appended in parentheses.
+
+    >>> format_statute([('720', 'chapter'), ('5', 'act_prefix'),
+    ...     ('12-3', 'section'), ('a', 'subsection')])
+    '720-5/12-3(a)'
+    """
+    base = "{}-{}/{}".format(parsed_statute[0][0], parsed_statute[1][0],
+                             parsed_statute[2][0])
+    suffix = "".join("({})".format(part[0]) for part in parsed_statute[3:])
+
+    return base + suffix
+
+def get_iucr(parsed_statute):
+    """
+    Look up the IUCR offenses matching a parsed statute.
+
+    Arguments:
+        parsed_statute: Sequence of (value, label) pairs ordered as chapter,
+            act prefix, section and then zero or more subsections.
+
+    Returns:
+        The result of iucr.lookup_by_ilcs() for the statute's components.
+
+    Raises:
+        IUCRLookupError: If a KeyError is raised during the lookup.  The
+            error is built from the formatted statute, not the raw one.
+    """
     try:
-        parsed = parse_statute(s)
-        chapter = parsed[0][0]
-        act_prefix = parsed[1][0]
-        section = parsed[2][0]
-        subsections = [ss[0] for ss in parsed[3:]]
+        chapter = parsed_statute[0][0]
+        act_prefix = parsed_statute[1][0]
+        section = parsed_statute[2][0]
+        subsections = [ss[0] for ss in parsed_statute[3:]]
         return iucr.lookup_by_ilcs(chapter, act_prefix, section, *subsections)
     except KeyError:
-        raise IUCRLookupError(s)
+        raise IUCRLookupError(format_statute(parsed_statute))
 
 def strip_surrounding_parens(s):
     """
     Strip surrounding parenthesis and curly braces from a statute string.
+
+    A leading "(" or trailing ")" is kept when it looks like part of a
+    single-character parenthesized group such as "(a)".  Empty strings, and
+    strings that become empty while stripping (e.g. "("), are returned
+    without raising an error.
+
+    Arguments:
+        s (str): Statute string, possibly wrapped in braces or parentheses
+
+    Returns:
+        The string with the surrounding braces and parentheses removed
     """
     s = s.strip('{').strip('}')
-    if s[0] == "(" and s[2] != ")":
+
+    # startswith()/endswith() are safe on an empty string, unlike s[0]/s[-1]
+    if s.startswith("(") and (len(s) < 3 or s[2] != ")"):
         s = s[1:]
-    if s[-1] == ")" and s[-3] != "(":
+
+    # s may have become empty above, so it still can't be indexed blindly
+    if s.endswith(")") and (len(s) < 3 or s[-3] != "("):
         s = s[:-1]
 
     return s
@@ -319,7 +346,7 @@ def strip_attempted_statute(s):
     For attempted offenses, this is some version of "720-5/8-4" (ILCS) or
     "38-8-4" (ILRS). For conspiracy offenses it's "720-5/8-2" and for
     solicitation it's "720-5/8-1".
-    
+
     The exact representation can vary widely.
 
     This function is needed because attempted crimes are represented by
@@ -329,7 +356,7 @@ def strip_attempted_statute(s):
     38-8-4(38-18-2)
 
     This breaks parsing the statutes for tasks like determining IUCR codes.
-    
+
     Returns:
         A tuple where the first item is the statute indicating the crime and
         the second item is the statute indicating an attempted offense. For