Merge branch 'main' into 27-add-detection-of-ridgelys-notes-and-other…

…-irregular-reporters
freelawproject · Feb 7, 2025 · ab077ed · ab077ed
2 parents 56dd76e + 32ee756
commit ab077ed
Show file tree

Hide file tree

Showing 14 changed files with 557 additions and 76 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -16,14 +16,25 @@ Fixes:
 
 ## Current
 
-**2.6.4 - 2024-06-03**
+**2.6.5 - 2025-01-28**
+
+Features:
+
+- Add ReferenceCitation model and associated logic
 
 Fixes:
 
-- Bump eyecite to for InvalidError/hyperscan bug
+- Fix court string matching with whitespace
+- Fix court name issues
 
 ## Past
 
+**2.6.4 - 2024-06-03**
+
+Fixes:
+
+- Bump eyecite for InvalidError/hyperscan bug
+
 **2.6.3 - 2024-04-09**
 
 Fixes:

diff --git a/README.rst b/README.rst
@@ -119,6 +119,45 @@ Extracting Citations
    that might refer to more than one reporter and can't be narrowed down by date.
 3. :code:`tokenizer` ==> Tokenizer, default :code:`eyecite.tokenizers.default_tokenizer`: An instance of a Tokenizer object (see "Tokenizers" below).
 
+Resolving Reference Citations
+-----------------------------
+
+Eyecite now supports a two-step process for extracting and resolving reference citations. This feature improves handling of citations that reference previously mentioned cases without explicitly repeating the full case name or citation.
+
+Reference citations, such as “Theatre Enterprises at 552”, can be difficult to extract accurately because they lack a full case name. For example, a judge may first cite ``Theatre Enterprises, Inc. v. Paramount Film Distributing Corp., 346 U. S. 537, 541 (1954)`` in full and later refer to the same case only as “Theatre Enterprises at 552”. To address this, Eyecite allows for an initial citation extraction, followed by a secondary reference resolution step. If you have an external database (e.g., CourtListener) that provides resolved case names, you can use this feature to enhance citation finding.
+
+For example, given the following text::
+
+    plain_text = (
+        "quoting Theatre Enterprises, Inc. v. Paramount Film Distributing Corp., 346 U. S. 537, 541 (1954); "
+        "alterations in original). Thus, the District Court understood that allegations of "
+        "parallel business conduct, taken alone, do not state a claim under § 1; "
+        "plaintiffs must allege additional facts that “ten[d] to exclude independent "
+        "self-interested conduct as an As Theatre Enterprises at 552 held, parallel"
+    )
+
+the reference citation can be extracted in four steps:
+
+::
+
+    from eyecite import get_citations
+    from eyecite.find import extract_reference_citations
+    from eyecite.helpers import filter_citations
+
+    # Step 1: Extract full citations
+    citations = get_citations(plain_text)
+
+    # Step 2: Resolve the case name from an external database or prior knowledge
+    citations[0].metadata.resolved_case_name_short = "Theatre Enterprises"
+
+    # Step 3: Extract reference citations using the resolved name
+    references = extract_reference_citations(citations[0], plain_text)
+
+    # Step 4: Filter and merge citations
+    new_citations = filter_citations(citations + references)
+
+Keep in mind that this feature requires an external database or heuristic method to resolve the short case name before extracting reference citations a second time.
+
 
 Cleaning Input Text
 -------------------

diff --git a/TUTORIAL.ipynb b/TUTORIAL.ipynb
@@ -54,7 +54,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "opinion_url = 'https://www.courtlistener.com/api/rest/v3/opinions/1741/'\n",
+    "opinion_url = 'https://www.courtlistener.com/api/rest/v4/opinions/1741/'\n",
     "opinion_text = requests.get(opinion_url).json()['plain_text']"
    ]
   },
@@ -163,6 +163,17 @@
     "Next, we'll extract the citations using a custom tokenizer. Unlike the default tokenizer, here we'll use our hyperscan tokenizer for much faster extraction, which works by automatically pre-compiling and caching a regular expression database on first use. Because of this one-time pre-compilation stage, the first use of this tokenizer is slow:"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1384d75b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# install hyperscan if not already installed\n",
+    "# !pip install hyperscan"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 7,

diff --git a/eyecite/annotate.py b/eyecite/annotate.py
@@ -1,11 +1,18 @@
 from bisect import bisect_left, bisect_right
 from difflib import SequenceMatcher
 from functools import partial
+from logging import getLogger
 from typing import Any, Callable, Iterable, Optional, Tuple
 
 import fast_diff_match_patch
 
-from eyecite.utils import is_balanced_html, wrap_html_tags
+from eyecite.utils import (
+    is_balanced_html,
+    maybe_balance_style_tags,
+    wrap_html_tags,
+)
+
+logger = getLogger("eyecite")
 
 
 def annotate_citations(
@@ -59,6 +66,9 @@ def annotate_citations(
     Returns:
         The annotated text.
     """
+    if unbalanced_tags not in ["unchecked", "skip", "wrap"]:
+        raise ValueError(f"Unknown option '{unbalanced_tags}")
+
     # set up offset_updater if we have to move annotations to source_text
     offset_updater = None
     if source_text and source_text != plain_text:
@@ -88,13 +98,20 @@ def annotate_citations(
         # handle HTML tags
         if unbalanced_tags == "unchecked":
             pass
-        elif unbalanced_tags in ("skip", "wrap"):
-            if not is_balanced_html(span_text):
-                if unbalanced_tags == "skip":
-                    continue
+        elif not is_balanced_html(span_text):
+            if unbalanced_tags == "wrap":
                 span_text = wrap_html_tags(span_text, after, before)
-        else:
-            raise ValueError(f"Unknown option '{unbalanced_tags}")
+            else:  # "skip" case
+                original_span_text = span_text
+                start, end, span_text = maybe_balance_style_tags(
+                    start, end, plain_text
+                )
+                if not is_balanced_html(span_text):
+                    logger.error(
+                        "Citation was not annotated due to unbalanced tags %s",
+                        original_span_text,
+                    )
+                    continue
 
         if annotator is not None:
             annotated_span = annotator(before, span_text, after)

diff --git a/eyecite/find.py b/eyecite/find.py
@@ -29,6 +29,7 @@
 )
 from eyecite.regexes import SHORT_CITE_ANTECEDENT_REGEX, SUPRA_ANTECEDENT_REGEX
 from eyecite.tokenizers import Tokenizer, default_tokenizer
+from eyecite.utils import DISALLOWED_NAMES
 
 
 def get_citations(
@@ -79,7 +80,7 @@ def get_citations(
 
                 # Check for reference citations that follow a full citation
                 # Using the plaintiff or defendant
-                references = _extract_reference_citations(citation, plain_text)
+                references = extract_reference_citations(citation, plain_text)
                 citations.extend(references)
 
         # CASE 2: Token is an "Id." or "Ibid." reference.
@@ -124,8 +125,9 @@ def get_citations(
     return citations
 
 
-def _extract_reference_citations(
-    citation: FullCitation, plain_text: str
+def extract_reference_citations(
+    citation: FullCitation,
+    plain_text: str,
 ) -> List[ReferenceCitation]:
     """Extract reference citations that follow a full citation
 
@@ -152,18 +154,19 @@ def is_valid_name(name: str) -> bool:
             and name[0].isupper()
             and not name.endswith(".")
             and not name.isdigit()
+            and name.lower() not in DISALLOWED_NAMES
         )
 
     regexes = [
         rf"(?P<{key}>{re.escape(value)})"
-        for key in ["plaintiff", "defendant"]
+        for key in ReferenceCitation.name_fields
         if (value := getattr(citation.metadata, key, None))
         and is_valid_name(value)
     ]
     if not regexes:
         return []
     pin_cite_re = (
-        rf"\b(?:{'|'.join(regexes)})\s+at\s+(?P<pin_cite>\d{{1,5}})\b"
+        rf"\b(?:{'|'.join(regexes)})\s+at(\s¶)?\s+(?P<pin_cite>\d{{1,5}})\b"
     )
     reference_citations = []
     remaining_text = plain_text[citation.span()[-1] :]
@@ -245,14 +248,18 @@ def _extract_shortform_citation(
         strings_only=True,
         forward=False,
     )
+    offset = 0
     if m:
+        ante_start, ante_end = m.span()
+        offset = ante_end - ante_start
         antecedent_guess = m["antecedent"].strip()
 
     # Get pin_cite
     cite_token = cast(CitationToken, words[index])
     pin_cite, span_end, parenthetical = extract_pin_cite(
         words, index, prefix=cite_token.groups["page"]
     )
+    span_end = span_end if span_end else 0
 
     # make ShortCaseCitation
     citation = ShortCaseCitation(
@@ -261,6 +268,8 @@ def _extract_shortform_citation(
         exact_editions=cite_token.exact_editions,
         variation_editions=cite_token.variation_editions,
         span_end=span_end,
+        full_span_start=cite_token.start - offset,
+        full_span_end=max([span_end, cite_token.end]),
         metadata={
             "antecedent_guess": antecedent_guess,
             "pin_cite": pin_cite,

diff --git a/eyecite/helpers.py b/eyecite/helpers.py
@@ -12,6 +12,7 @@
     FullJournalCitation,
     FullLawCitation,
     ParagraphToken,
+    ReferenceCitation,
     ResourceCitation,
     StopWordToken,
     Token,
@@ -100,6 +101,17 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
     citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
     citation.metadata.extra = (m["extra"] or "").strip() or None
     citation.metadata.parenthetical = process_parenthetical(m["parenthetical"])
+
+    if (
+        citation.full_span_end
+        and m["parenthetical"] is not None
+        and isinstance(citation.metadata.parenthetical, str)
+    ):
+        if len(m["parenthetical"]) > len(citation.metadata.parenthetical):
+            offset = len(m["parenthetical"]) - len(
+                citation.metadata.parenthetical
+            )
+            citation.full_span_end = citation.full_span_end - offset
     citation.metadata.year = m["year"]
     if m["year"]:
         citation.year = get_year(m["year"])
@@ -317,6 +329,15 @@ def disambiguate_reporters(
     ]
 
 
+def overlapping_citations(
+    full_span_1: Tuple[int, int], full_span_2: Tuple[int, int]
+) -> bool:
+    """Check if citations overlap at all"""
+    start_1, end_1 = full_span_1
+    start_2, end_2 = full_span_2
+    return max(start_1, start_2) < min(end_1, end_2)
+
+
 def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
     """Filter and order citations, ensuring reference citations are in sequence
 
@@ -328,18 +349,30 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
     :param citations: List of citations
     :return: Sorted and filtered citations
     """
+    citations = list(
+        {citation.span(): citation for citation in citations}.values()
+    )
     filtered_citations: List[CitationBase] = []
-    sorted_citations = sorted(citations, key=lambda citation: citation.span())
+    sorted_citations = sorted(
+        citations, key=lambda citation: citation.full_span()
+    )
     for citation in sorted_citations:
         if filtered_citations:
             last_citation = filtered_citations[-1]
-            last_span = last_citation.span()
-            current_span = citation.span()
-
-            if current_span[0] <= last_span[1]:
-                # Remove overlapping citations that can occur in edge cases
+            is_overlapping = overlapping_citations(
+                citation.full_span(), last_citation.full_span()
+            )
+            if is_overlapping and isinstance(last_citation, ReferenceCitation):
+                # Remove the overlapping reference citation
+                filtered_citations.pop(-1)
+                filtered_citations.append(citation)
                 continue
-        filtered_citations.append(citation)
+            if is_overlapping and isinstance(citation, ReferenceCitation):
+                # Skip overlapping reference citations
+                continue
+            filtered_citations.append(citation)
+        else:
+            filtered_citations.append(citation)
     return filtered_citations
 
 

diff --git a/eyecite/models.py b/eyecite/models.py
@@ -456,6 +456,9 @@ class Metadata(CaseCitation.Metadata):
         plaintiff: Optional[str] = None
         defendant: Optional[str] = None
         extra: Optional[str] = None
+        # May be populated after citation resolution
+        resolved_case_name_short: Optional[str] = None
+        resolved_case_name: Optional[str] = None
 
     def add_metadata(self, words: "Tokens"):
         """Extract metadata from text before and after citation."""
@@ -604,6 +607,15 @@ class Metadata(CitationBase.Metadata):
         plaintiff: Optional[str] = None
         defendant: Optional[str] = None
         pin_cite: Optional[str] = None
+        resolved_case_name_short: Optional[str] = None
+        resolved_case_name: Optional[str] = None
+
+    name_fields = [
+        "plaintiff",
+        "defendant",
+        "resolved_case_name_short",
+        "resolved_case_name",
+    ]
 
 
 @dataclass(eq=False, unsafe_hash=False, repr=False)

diff --git a/eyecite/regexes.py b/eyecite/regexes.py
@@ -212,7 +212,7 @@ def short_cite_re(regex):
 # What case does a short cite refer to? For now, we just capture the previous
 # word optionally followed by a comma. Example: Adarand, 515 U.S. at 241.
 SHORT_CITE_ANTECEDENT_REGEX = r"""
-    (?P<antecedent>[\w\-.]+),?
+    (?P<antecedent>[A-Za-z][\w\-.]+),?
     \   # final space
 """