Skip to content

Commit

Permalink
Merge branch 'main' into 27-add-detection-of-ridgelys-notes-and-other…
Browse files Browse the repository at this point in the history
…-irregular-reporters
  • Loading branch information
quevon24 authored Feb 7, 2025
2 parents 56dd76e + 32ee756 commit ab077ed
Show file tree
Hide file tree
Showing 14 changed files with 557 additions and 76 deletions.
15 changes: 13 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,25 @@ Fixes:

## Current

**2.6.4 - 2024-06-03**
**2.6.5 - 2025-01-28**

Features:

- Add ReferenceCitation model and associated logic

Fixes:

- Bump eyecite to fix InvalidError/hyperscan bug
- Fix court string matching with whitespace
- Fix court name issues

## Past

**2.6.4 - 2024-06-03**

Fixes:

- Bump eyecite to fix InvalidError/hyperscan bug

**2.6.3 - 2024-04-09**

Fixes:
Expand Down
39 changes: 39 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,45 @@ Extracting Citations
that might refer to more than one reporter and can't be narrowed down by date.
3. :code:`tokenizer` ==> Tokenizer, default :code:`eyecite.tokenizers.default_tokenizer`: An instance of a Tokenizer object (see "Tokenizers" below).

Resolving Reference Citations
-----------------------------

Eyecite now supports a two-step process for extracting and resolving reference citations. This feature improves handling of citations that reference previously mentioned cases without explicitly repeating the full case name or citation.

Reference citations, such as “Theatre Enterprises at 552”, can be difficult to extract accurately because they lack a full case name; here, for example, the judge is referring back to `Theatre Enterprises, Inc. v. Paramount Film Distributing Corp., 346 U. S. 537, 541 (1954)`. To address this, Eyecite allows for an initial citation extraction, followed by a secondary reference resolution step. If you have an external database (e.g., CourtListener) that provides resolved case names, you can use this feature to enhance citation finding.

::

    from eyecite import get_citations
    from eyecite.find import extract_reference_citations
    from eyecite.helpers import filter_citations

    plain_text = (
        "quoting Theatre Enterprises, Inc. v. Paramount Film Distributing Corp., 346 U. S. 537, 541 (1954); "
        "alterations in original). Thus, the District Court understood that allegations of "
        "parallel business conduct, taken alone, do not state a claim under § 1; "
        "plaintiffs must allege additional facts that “ten[d] to exclude independent "
        "self-interested conduct as an As Theatre Enterprises at 552 held, parallel"
    )

    # Step 1: Extract full citations
    citations = get_citations(plain_text)

    # Step 2: Resolve the case name from an external database or prior knowledge
    citations[0].metadata.resolved_case_name_short = "Theatre Enterprises"

    # Step 3: Extract reference citations using the resolved name
    references = extract_reference_citations(citations[0], plain_text)

    # Step 4: Filter and merge citations
    new_citations = filter_citations(citations + references)

Keep in mind that this feature requires an external database or heuristic method to resolve the short case name before extracting reference citations a second time.


Cleaning Input Text
-------------------
Expand Down
13 changes: 12 additions & 1 deletion TUTORIAL.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
"metadata": {},
"outputs": [],
"source": [
"opinion_url = 'https://www.courtlistener.com/api/rest/v3/opinions/1741/'\n",
"opinion_url = 'https://www.courtlistener.com/api/rest/v4/opinions/1741/'\n",
"opinion_text = requests.get(opinion_url).json()['plain_text']"
]
},
Expand Down Expand Up @@ -163,6 +163,17 @@
"Next, we'll extract the citations using a custom tokenizer. Unlike the default tokenizer, here we'll use our hyperscan tokenizer for much faster extraction, which works by automatically pre-compiling and caching a regular expression database on first use. Because of this one-time pre-compilation stage, the first use of this tokenizer is slow:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1384d75b",
"metadata": {},
"outputs": [],
"source": [
"# install hyperscan if not already installed\n",
"# !pip install hyperscan"
]
},
{
"cell_type": "code",
"execution_count": 7,
Expand Down
31 changes: 24 additions & 7 deletions eyecite/annotate.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from bisect import bisect_left, bisect_right
from difflib import SequenceMatcher
from functools import partial
from logging import getLogger
from typing import Any, Callable, Iterable, Optional, Tuple

import fast_diff_match_patch

from eyecite.utils import is_balanced_html, wrap_html_tags
from eyecite.utils import (
is_balanced_html,
maybe_balance_style_tags,
wrap_html_tags,
)

logger = getLogger("eyecite")


def annotate_citations(
Expand Down Expand Up @@ -59,6 +66,9 @@ def annotate_citations(
Returns:
The annotated text.
"""
if unbalanced_tags not in ["unchecked", "skip", "wrap"]:
raise ValueError(f"Unknown option '{unbalanced_tags}")

# set up offset_updater if we have to move annotations to source_text
offset_updater = None
if source_text and source_text != plain_text:
Expand Down Expand Up @@ -88,13 +98,20 @@ def annotate_citations(
# handle HTML tags
if unbalanced_tags == "unchecked":
pass
elif unbalanced_tags in ("skip", "wrap"):
if not is_balanced_html(span_text):
if unbalanced_tags == "skip":
continue
elif not is_balanced_html(span_text):
if unbalanced_tags == "wrap":
span_text = wrap_html_tags(span_text, after, before)
else:
raise ValueError(f"Unknown option '{unbalanced_tags}")
else: # "skip" case
original_span_text = span_text
start, end, span_text = maybe_balance_style_tags(
start, end, plain_text
)
if not is_balanced_html(span_text):
logger.error(
"Citation was not annotated due to unbalanced tags %s",
original_span_text,
)
continue

if annotator is not None:
annotated_span = annotator(before, span_text, after)
Expand Down
19 changes: 14 additions & 5 deletions eyecite/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
)
from eyecite.regexes import SHORT_CITE_ANTECEDENT_REGEX, SUPRA_ANTECEDENT_REGEX
from eyecite.tokenizers import Tokenizer, default_tokenizer
from eyecite.utils import DISALLOWED_NAMES


def get_citations(
Expand Down Expand Up @@ -79,7 +80,7 @@ def get_citations(

# Check for reference citations that follow a full citation
# Using the plaintiff or defendant
references = _extract_reference_citations(citation, plain_text)
references = extract_reference_citations(citation, plain_text)
citations.extend(references)

# CASE 2: Token is an "Id." or "Ibid." reference.
Expand Down Expand Up @@ -124,8 +125,9 @@ def get_citations(
return citations


def _extract_reference_citations(
citation: FullCitation, plain_text: str
def extract_reference_citations(
citation: FullCitation,
plain_text: str,
) -> List[ReferenceCitation]:
"""Extract reference citations that follow a full citation
Expand All @@ -152,18 +154,19 @@ def is_valid_name(name: str) -> bool:
and name[0].isupper()
and not name.endswith(".")
and not name.isdigit()
and name.lower() not in DISALLOWED_NAMES
)

regexes = [
rf"(?P<{key}>{re.escape(value)})"
for key in ["plaintiff", "defendant"]
for key in ReferenceCitation.name_fields
if (value := getattr(citation.metadata, key, None))
and is_valid_name(value)
]
if not regexes:
return []
pin_cite_re = (
rf"\b(?:{'|'.join(regexes)})\s+at\s+(?P<pin_cite>\d{{1,5}})\b"
rf"\b(?:{'|'.join(regexes)})\s+at(\s¶)?\s+(?P<pin_cite>\d{{1,5}})\b"
)
reference_citations = []
remaining_text = plain_text[citation.span()[-1] :]
Expand Down Expand Up @@ -245,14 +248,18 @@ def _extract_shortform_citation(
strings_only=True,
forward=False,
)
offset = 0
if m:
ante_start, ante_end = m.span()
offset = ante_end - ante_start
antecedent_guess = m["antecedent"].strip()

# Get pin_cite
cite_token = cast(CitationToken, words[index])
pin_cite, span_end, parenthetical = extract_pin_cite(
words, index, prefix=cite_token.groups["page"]
)
span_end = span_end if span_end else 0

# make ShortCaseCitation
citation = ShortCaseCitation(
Expand All @@ -261,6 +268,8 @@ def _extract_shortform_citation(
exact_editions=cite_token.exact_editions,
variation_editions=cite_token.variation_editions,
span_end=span_end,
full_span_start=cite_token.start - offset,
full_span_end=max([span_end, cite_token.end]),
metadata={
"antecedent_guess": antecedent_guess,
"pin_cite": pin_cite,
Expand Down
47 changes: 40 additions & 7 deletions eyecite/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
FullJournalCitation,
FullLawCitation,
ParagraphToken,
ReferenceCitation,
ResourceCitation,
StopWordToken,
Token,
Expand Down Expand Up @@ -100,6 +101,17 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
citation.metadata.extra = (m["extra"] or "").strip() or None
citation.metadata.parenthetical = process_parenthetical(m["parenthetical"])

if (
citation.full_span_end
and m["parenthetical"] is not None
and isinstance(citation.metadata.parenthetical, str)
):
if len(m["parenthetical"]) > len(citation.metadata.parenthetical):
offset = len(m["parenthetical"]) - len(
citation.metadata.parenthetical
)
citation.full_span_end = citation.full_span_end - offset
citation.metadata.year = m["year"]
if m["year"]:
citation.year = get_year(m["year"])
Expand Down Expand Up @@ -317,6 +329,15 @@ def disambiguate_reporters(
]


def overlapping_citations(
    full_span_1: Tuple[int, int], full_span_2: Tuple[int, int]
) -> bool:
    """Check whether two citation spans intersect.

    Each span is a ``(start, end)`` pair. The comparison is strict, so spans
    that merely touch (one ends exactly where the other starts) are not
    treated as overlapping.

    :param full_span_1: ``(start, end)`` offsets of the first citation
    :param full_span_2: ``(start, end)`` offsets of the second citation
    :return: True if the spans intersect, False otherwise
    """
    start_1, end_1 = full_span_1
    start_2, end_2 = full_span_2
    # The intersection runs from the later start to the earlier end; it is
    # non-empty only when that start lies strictly before that end.
    return max(start_1, start_2) < min(end_1, end_2)


def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
"""Filter and order citations, ensuring reference citations are in sequence
Expand All @@ -328,18 +349,30 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
:param citations: List of citations
:return: Sorted and filtered citations
"""
citations = list(
{citation.span(): citation for citation in citations}.values()
)
filtered_citations: List[CitationBase] = []
sorted_citations = sorted(citations, key=lambda citation: citation.span())
sorted_citations = sorted(
citations, key=lambda citation: citation.full_span()
)
for citation in sorted_citations:
if filtered_citations:
last_citation = filtered_citations[-1]
last_span = last_citation.span()
current_span = citation.span()

if current_span[0] <= last_span[1]:
# Remove overlapping citations that can occur in edge cases
is_overlapping = overlapping_citations(
citation.full_span(), last_citation.full_span()
)
if is_overlapping and isinstance(last_citation, ReferenceCitation):
# Remove the overlapping reference citation
filtered_citations.pop(-1)
filtered_citations.append(citation)
continue
filtered_citations.append(citation)
if is_overlapping and isinstance(citation, ReferenceCitation):
# Skip overlapping reference citations
continue
filtered_citations.append(citation)
else:
filtered_citations.append(citation)
return filtered_citations


Expand Down
12 changes: 12 additions & 0 deletions eyecite/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,9 @@ class Metadata(CaseCitation.Metadata):
plaintiff: Optional[str] = None
defendant: Optional[str] = None
extra: Optional[str] = None
# May be populated after citation resolution
resolved_case_name_short: Optional[str] = None
resolved_case_name: Optional[str] = None

def add_metadata(self, words: "Tokens"):
"""Extract metadata from text before and after citation."""
Expand Down Expand Up @@ -604,6 +607,15 @@ class Metadata(CitationBase.Metadata):
plaintiff: Optional[str] = None
defendant: Optional[str] = None
pin_cite: Optional[str] = None
resolved_case_name_short: Optional[str] = None
resolved_case_name: Optional[str] = None

name_fields = [
"plaintiff",
"defendant",
"resolved_case_name_short",
"resolved_case_name",
]


@dataclass(eq=False, unsafe_hash=False, repr=False)
Expand Down
2 changes: 1 addition & 1 deletion eyecite/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def short_cite_re(regex):
# What case does a short cite refer to? For now, we just capture the previous
# word optionally followed by a comma. Example: Adarand, 515 U.S. at 241.
SHORT_CITE_ANTECEDENT_REGEX = r"""
(?P<antecedent>[\w\-.]+),?
(?P<antecedent>[A-Za-z][\w\-.]+),?
\ # final space
"""

Expand Down
Loading

0 comments on commit ab077ed

Please sign in to comment.