From f2ec1e11dcc432a4aacc8c5f88d691f72cd0b9d5 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 10 Jan 2025 14:38:30 -0500
Subject: [PATCH 01/40] feat(find): Add reference citation to model
Add recognition of "XYZ at 123"-style reference citations to eyecite
Recognize parallel citations and share their plaintiff/defendant metadata
---
eyecite/find.py | 72 ++++++++++++++++++++++++++++++++++++--
eyecite/helpers.py | 9 +++++
eyecite/models.py | 45 ++++++++++++++++++++++++
eyecite/resolve.py | 59 +++++++++++++++++++++++++++++++
eyecite/test_factories.py | 9 +++++
tests/test_AnnotateTest.py | 6 ++++
tests/test_FindTest.py | 52 ++++++++++++++++++++++++++-
tests/test_ResolveTest.py | 61 ++++++++++++++++++++++++++++++++
8 files changed, 310 insertions(+), 3 deletions(-)
diff --git a/eyecite/find.py b/eyecite/find.py
index 7c4d9758..4e9defc2 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -1,3 +1,4 @@
+import re
from typing import List, Type, cast
from eyecite.helpers import (
@@ -5,6 +6,7 @@
extract_pin_cite,
joke_cite,
match_on_tokens,
+ order_citations,
)
from eyecite.models import (
CitationBase,
@@ -15,6 +17,7 @@
FullLawCitation,
IdCitation,
IdToken,
+ ReferenceCitation,
ResourceCitation,
SectionToken,
ShortCaseCitation,
@@ -22,6 +25,7 @@
SupraToken,
Tokens,
UnknownCitation,
+ CaseReferenceToken,
)
from eyecite.regexes import SHORT_CITE_ANTECEDENT_REGEX, SUPRA_ANTECEDENT_REGEX
from eyecite.tokenizers import Tokenizer, default_tokenizer
@@ -69,7 +73,15 @@ def get_citations(
if citation_token.short:
citation = _extract_shortform_citation(words, i)
else:
- citation = _extract_full_citation(words, i)
+ citation: FullCitation = _extract_full_citation(words, i)
+ if citations and citation.is_parallel_citation(citations[-1]):
+ # Check if parallel citation and merge plaintiff/defendants
+ citation.is_parallel_citation(citations[-1])
+
+ # Check for reference citations that follow a full citation
+ # Using the plaintiff or defendant
+ references = _extract_reference_citations(citation, plain_text)
+ citations.extend(references)
# CASE 2: Token is an "Id." or "Ibid." reference.
# In this case, the citation should simply be to the item cited
@@ -99,6 +111,8 @@ def get_citations(
citations.append(citation)
+ citations = order_citations(citations)
+
# Remove citations with multiple reporter candidates where we couldn't
# guess correct reporter
if remove_ambiguous:
@@ -107,10 +121,64 @@ def get_citations(
# Returns a list of citations ordered in the sequence that they appear in
# the document. The ordering of this list is important for reconstructing
# the references of the ShortCaseCitation, SupraCitation, and
- # IdCitation objects.
+ # IdCitation and ReferenceCitation objects.
return citations
+def _extract_reference_citations(
+ citation: FullCitation, plain_text: str
+) -> List[ReferenceCitation]:
+ """Extract reference citations that follow a full citation
+
+ :param citation: the full case citation found
+ :param plain_text: the text
+ :return: Pincite reference citations
+ """
+ if type(citation) != FullCaseCitation:
+ # Skip if not case law citation
+ return []
+ if not citation.metadata.defendant:
+ # Skip if no defendant exists
+ return []
+ escaped_plaintiff = re.escape(citation.metadata.plaintiff or "")
+ escaped_defendant = re.escape(citation.metadata.defendant)
+ pin_cite_regex = (
+ rf"\b(?:"
+ rf"(?P{escaped_plaintiff})|"
+ rf"(?P{escaped_defendant})\s?"
+ rf")\s+at\s+(?P\d{{1,5}})?\b"
+ )
+
+ pin_cite_pattern = re.compile(pin_cite_regex)
+ reference_citations = []
+ if len(plain_text) <= citation.span()[-1]:
+ return []
+
+ remaining_text = plain_text[citation.span()[-1] :]
+ offset = citation.span()[-1]
+ for match in pin_cite_pattern.finditer(remaining_text):
+ start, end = match.span()
+ matched_text = match.group(0)
+
+ reference = ReferenceCitation(
+ token=CaseReferenceToken(
+ data=matched_text, start=start + offset, end=end + offset
+ ),
+ span_start=start + offset,
+ span_end=end + offset,
+ full_span_start=start + offset,
+ full_span_end=end + offset,
+ index=0,
+ metadata={
+ "plaintiff": match.group("plaintiff"),
+ "defendant": match.group("defendant"),
+ "pin_cite": match.group("page"),
+ },
+ )
+ reference_citations.append(reference)
+ return reference_citations
+
+
def _extract_full_citation(
words: Tokens,
index: int,
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index 4380be6d..4087a717 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -306,6 +306,15 @@ def disambiguate_reporters(
if not isinstance(c, ResourceCitation) or c.edition_guess
]
+def order_citations(citations: List[CitationBase]) -> List[CitationBase]:
+ """
+ Order citations that may have reference citations out or sequential order
+
+ :param citations: List of citation`
+ :return: Sorted citations
+ """
+ return sorted(citations, key=lambda citation: citation.span())
+
joke_cite: List[CitationBase] = [
FullCaseCitation(
diff --git a/eyecite/models.py b/eyecite/models.py
index 69a7d9a0..762dbbbf 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -301,6 +301,26 @@ class FullCitation(ResourceCitation):
"""Abstract base class indicating that a citation fully identifies a
resource."""
+ def is_parallel_citation(self, preceding: CitationBase):
+ """Check if preceding citation is parallel
+
+ If parallel match plaintiff and defendant metadata
+
+ Args:
+ preceding (): The previous citation found
+
+ Returns: None
+
+ """
+ is_parallel = (
+ self.full_span_start == preceding.full_span_start
+ and self.full_span_end == preceding.full_span_end
+ and type(preceding) == FullCaseCitation
+ )
+ if is_parallel:
+ self.metadata.defendant = preceding.metadata.defendant
+ self.metadata.plaintiff = preceding.metadata.plaintiff
+
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullLawCitation(FullCitation):
@@ -566,6 +586,26 @@ def formatted(self):
return "".join(parts)
+@dataclass(eq=False, unsafe_hash=False, repr=False)
+class ReferenceCitation(CitationBase):
+ """A reference citation is a citation that refers to
+ a full case citation by name and pincite alone.
+
+ Future versions hopefully will drop the pincite requirement
+
+ Examples:
+ Roe at 240
+ """
+
+ @dataclass(eq=True, unsafe_hash=True)
+ class Metadata(CitationBase.Metadata):
+ """Define fields on self.metadata."""
+
+ plaintiff: Optional[str] = None
+ defendant: Optional[str] = None
+ pin_cite: Optional[str] = None
+
+
@dataclass(eq=False, unsafe_hash=False, repr=False)
class UnknownCitation(CitationBase):
"""Convenience class which represents an unknown citation. A recognized
@@ -679,6 +719,11 @@ class StopWordToken(Token):
"""Word matching one of the STOP_TOKENS."""
+@dataclass(eq=True, unsafe_hash=True)
+class CaseReferenceToken(Token):
+ """Word matching plaintiff or defendant in a full case citation"""
+
+
@dataclass
class TokenExtractor:
"""Class for extracting all matches from a given string for the given
diff --git a/eyecite/resolve.py b/eyecite/resolve.py
index 5001cb90..440dd0ba 100644
--- a/eyecite/resolve.py
+++ b/eyecite/resolve.py
@@ -11,6 +11,7 @@
ResourceType,
ShortCaseCitation,
SupraCitation,
+ ReferenceCitation,
)
from eyecite.utils import strip_punct
@@ -83,6 +84,34 @@ def _filter_by_matching_antecedent(
return matches[0] if len(matches) == 1 else None
+def _filter_by_matching_plaintiff_or_defendant(
+ resolved_full_cites: ResolvedFullCites,
+ plaintiff: str,
+ defendant: str,
+) -> Optional[ResourceType]:
+ """Filter out any impossible reference citations"""
+ matches: List[ResourceType] = []
+ plaintiff: str = strip_punct(plaintiff or "")
+ defendant: str = strip_punct(defendant or "")
+ for full_citation, resource in resolved_full_cites:
+ if not isinstance(full_citation, FullCaseCitation):
+ continue
+ if (
+ full_citation.metadata.defendant
+ and defendant in full_citation.metadata.defendant
+ ):
+ matches.append(resource)
+ elif (
+ full_citation.metadata.plaintiff
+ and plaintiff in full_citation.metadata.plaintiff
+ ):
+ matches.append(resource)
+
+ # Remove duplicates and only accept if one candidate remains
+ matches = list(set(matches))
+ return matches[0] if len(matches) == 1 else None
+
+
def _has_invalid_pin_cite(
full_cite: FullCitation, id_cite: IdCitation
) -> bool:
@@ -180,6 +209,27 @@ def _resolve_supra_citation(
)
+def _resolve_reference_citation(
+ reference_citation: ReferenceCitation,
+ resolved_full_cites: ResolvedFullCites,
+) -> Optional[ResourceType]:
+ """
+ Try to resolve reference citations by checking whether their is only one
+ full citation that appears with either the defendant or plaintiff
+ field of any of the previously resolved full citations.
+ """
+ if (
+ not reference_citation.metadata.defendant
+ and not reference_citation.metadata.plaintiff
+ ):
+ return None
+ return _filter_by_matching_plaintiff_or_defendant(
+ resolved_full_cites,
+ reference_citation.metadata.plaintiff,
+ reference_citation.metadata.defendant,
+ )
+
+
def _resolve_id_citation(
id_citation: IdCitation,
last_resolution: ResourceType,
@@ -214,6 +264,10 @@ def resolve_citations(
[SupraCitation, ResolvedFullCites],
Optional[ResourceType],
] = _resolve_supra_citation,
+ resolve_reference_citation: Callable[
+ [ReferenceCitation, ResolvedFullCites],
+ Optional[ResourceType],
+ ] = _resolve_reference_citation,
resolve_id_citation: Callable[
[IdCitation, ResourceType, Resolutions], Optional[ResourceType]
] = _resolve_id_citation,
@@ -286,6 +340,11 @@ def resolve_citations(
elif isinstance(citation, SupraCitation):
resolution = resolve_supra_citation(citation, resolved_full_cites)
+ elif isinstance(citation, ReferenceCitation):
+ resolution = resolve_reference_citation(
+ citation, resolved_full_cites
+ )
+
# If the citation is an id citation, try to resolve it
elif isinstance(citation, IdCitation):
resolution = resolve_id_citation(
diff --git a/eyecite/test_factories.py b/eyecite/test_factories.py
index b200ebb2..a14aa077 100644
--- a/eyecite/test_factories.py
+++ b/eyecite/test_factories.py
@@ -11,6 +11,8 @@
SupraCitation,
SupraToken,
UnknownCitation,
+ ReferenceCitation,
+ CaseReferenceToken,
)
from eyecite.tokenizers import EDITIONS_LOOKUP
@@ -98,6 +100,13 @@ def id_citation(source_text=None, index=0, **kwargs):
return IdCitation(IdToken(source_text, 0, 99), index, **kwargs)
+def reference_citation(source_text=None, index=0, **kwargs):
+ """Convenience function for creating mock ReferenceCitation objects."""
+ return ReferenceCitation(
+ CaseReferenceToken(source_text, 0, 99), index, **kwargs
+ )
+
+
def unknown_citation(source_text=None, index=0, **kwargs):
"""Convenience function for creating mock UnknownCitation objects."""
return UnknownCitation(SectionToken(source_text, 0, 99), index, **kwargs)
diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py
index 7ae3d711..a7da5888 100644
--- a/tests/test_AnnotateTest.py
+++ b/tests/test_AnnotateTest.py
@@ -47,6 +47,12 @@ def lower_annotator(before, text, after):
"<0>1 U.S. 10>. Foo v. Bar, <1>supra at 21>.",
[],
),
+ # Refernce cite
+ (
+ "Foo v. Bar 1 U.S. 1. In Foo at 2.",
+ "Foo v. Bar <0>1 U.S. 10>. In <1>Foo at 21>.",
+ [],
+ ),
# whitespace and html -- no unbalanced tag check
(
"foo 1 U.S. 1 bar",
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 47339f1d..0da5d918 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -13,6 +13,7 @@
id_citation,
journal_citation,
law_citation,
+ reference_citation,
supra_citation,
unknown_citation,
)
@@ -132,7 +133,7 @@ def test_find_citations(self):
'parenthetical': 'overruling foo'}),
case_citation(page='2', reporter='S. Ct.', year=1982,
metadata={'plaintiff': 'lissner',
- 'defendant': 'test 1 U.S. 12, 347-348',
+ 'defendant': 'test',
'court': 'ca4',
'pin_cite': '358',
'parenthetical': 'overruling foo'}),
@@ -440,6 +441,55 @@ def test_find_citations(self):
[],),
('lorem 111 N. W. 12th St.',
[],),
+ ('General Casualty cites as compelling Amick v. Liberty Mut. Ins. Co., 455 A.2d 793 (R.I. 1983). In that case ... Stats, do. See Amick at 795',
+ [case_citation(page='793',
+ volume="455",
+ reporter="A.2d",
+ year=1983,
+ metadata={'plaintiff': 'Amick',
+ 'defendant': 'Liberty Mut. Ins. Co.',
+ 'court':'ri'
+ }),
+ reference_citation('Amick at 795', metadata={'plaintiff':'Amick', 'pin_cite': '795'})]),
+ # Test reference citation
+ ('Foo v. Bar 1 U.S. 12, 347-348. something something, In Foo at 62 we see that',
+ [case_citation(page='12',
+ metadata={'plaintiff': 'Foo',
+ 'defendant': 'Bar',
+ 'pin_cite': '347-348'}),
+ reference_citation('Foo at 62', metadata={'plaintiff':'Foo', 'pin_cite': '62'})]),
+ # Test that reference citation must occur after full case citation
+ ('In Foo at 62 we see that, Foo v. Bar 1 U.S. 12, 347-348. something something,',
+ [case_citation(page='12',
+ metadata={'plaintiff': 'Foo',
+ 'defendant': 'Bar',
+ 'pin_cite': '347-348'})]),
+ # Test reference against defendant name
+ ('In re Foo 1 Mass. 12, 347-348. something something, in Foo at 62 we see that, ',
+ [case_citation(page='12',
+ reporter="Mass.",
+ volume="1",
+ metadata={
+ 'defendant': 'Foo',
+ 'pin_cite': '347-348'
+ }),
+ reference_citation('Foo at 62', metadata={'defendant': 'Foo',
+ "pin_cite": "62"}),
+ ]),
+ # Test reference citation after an id citation
+ ('we said in Morton v. Mancari, 417 U. S. 535, 552 (1974) “Literally every piece ....”. “asisovereign tribal entities . . . .” Id. In Mancari at 665',
+ [case_citation(page='535',
+ year=1974,
+ volume="417",
+ reporter="U. S.",
+ metadata={'plaintiff': 'Morton',
+ 'defendant': 'Mancari',
+ "pin_cite": "552",
+ "court":"scotus",
+ }),
+ id_citation('Id.,', metadata={}),
+ reference_citation('Mancari', metadata={'defendant':'Mancari', "pin_cite": "665"}),
+ ]),
# Test Conn. Super. Ct. regex variation.
('Failed to recognize 1993 Conn. Super. Ct. 5243-P',
[case_citation(volume='1993', reporter='Conn. Super. Ct.',
diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py
index 61c48987..f0e7f340 100644
--- a/tests/test_ResolveTest.py
+++ b/tests/test_ResolveTest.py
@@ -31,6 +31,43 @@ def assertResolution(self, citations, expected_resolution_dict):
format_resolution(expected_resolution_dict),
)
+ def checkReferenceResolution(self, *expected_resolutions: tuple[list[list[int]], str]):
+ """
+ Helper function to help test reference citations.
+
+ Args:
+ *expected_resolutions (tuple[list[int], str]):
+ A list of tuples where each tuple contains:
+ - A list of expected indices for the resolved citations.
+ - A string of citation text to process.
+
+ Returns:
+ None
+ """
+ for expected_indices, citation_text in expected_resolutions:
+ citations = get_citations(citation_text)
+
+ # Step 2: Build a helper dict to map corrected citations to indices
+ resolution_index_map = {
+ cite.corrected_citation(): idx for idx, cite in
+ enumerate(citations)
+ }
+
+ # Step 3: Resolve citations and format the resolution
+ resolved_citations = resolve_citations(citations)
+ formatted_resolution = format_resolution(resolved_citations)
+
+ # Step 4: Map resolved citations to their indices
+ result = {
+ key: [resolution_index_map[value] for value in values]
+ for key, values in formatted_resolution.items()
+ }
+
+ # Step 5: Compare the actual results with expected indices
+ actual_indices = list(result.values())
+ self.assertEqual(expected_indices, actual_indices)
+
+
def checkResolution(
self, *expected_resolutions: Tuple[Optional[int], str]
):
@@ -297,3 +334,27 @@ def test_complex_resolution(self):
),
(2, "However, this should succeed, Lorem, 1 U.S., at 52."),
)
+
+ def test_reference_resolution(self):
+ self.checkReferenceResolution(
+ (
+ [[0, 1]],
+ "Foo v. Bar, 1 U.S. 1 ... Foo at 2"
+ ),
+ (
+ [[0]],
+ "Foo at 2. .... ; Foo v. Bar, 1 U.S. 1"
+ ),
+ (
+ [[0, 1]],
+ "Foo v. Bar 1 U.S. 12, 347-348. something something, In Foo at 62 we see that"
+ ),
+ (
+ [[0, 2], [1]],
+ "Foo v. Bar 1 U.S. 12, 347-348; 12 U.S. 1. someting; In Foo at 2 we see that"
+ ),
+ (
+ [[0, 2], [1]],
+ "Foo v. Bar 1 U.S. 12, 347-348; In Smith, 12 U.S. 1 (1999) we saw something else. someting. In Foo at 2 we see that"
+ )
+ )
\ No newline at end of file
From 98b90a10c94fc900fe1227b801f4c0d005f507cc Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 10 Jan 2025 14:55:33 -0500
Subject: [PATCH 02/40] fix(eyecite): Lint
---
eyecite/helpers.py | 1 +
tests/test_ResolveTest.py | 29 ++++++++++++-----------------
2 files changed, 13 insertions(+), 17 deletions(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index 4087a717..50285d65 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -306,6 +306,7 @@ def disambiguate_reporters(
if not isinstance(c, ResourceCitation) or c.edition_guess
]
+
def order_citations(citations: List[CitationBase]) -> List[CitationBase]:
"""
Order citations that may have reference citations out or sequential order
diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py
index f0e7f340..6f644123 100644
--- a/tests/test_ResolveTest.py
+++ b/tests/test_ResolveTest.py
@@ -31,7 +31,9 @@ def assertResolution(self, citations, expected_resolution_dict):
format_resolution(expected_resolution_dict),
)
- def checkReferenceResolution(self, *expected_resolutions: tuple[list[list[int]], str]):
+ def checkReferenceResolution(
+ self, *expected_resolutions: tuple[list[list[int]], str]
+ ):
"""
Helper function to help test reference citations.
@@ -49,8 +51,8 @@ def checkReferenceResolution(self, *expected_resolutions: tuple[list[list[int]],
# Step 2: Build a helper dict to map corrected citations to indices
resolution_index_map = {
- cite.corrected_citation(): idx for idx, cite in
- enumerate(citations)
+ cite.corrected_citation(): idx
+ for idx, cite in enumerate(citations)
}
# Step 3: Resolve citations and format the resolution
@@ -67,7 +69,6 @@ def checkReferenceResolution(self, *expected_resolutions: tuple[list[list[int]],
actual_indices = list(result.values())
self.assertEqual(expected_indices, actual_indices)
-
def checkResolution(
self, *expected_resolutions: Tuple[Optional[int], str]
):
@@ -337,24 +338,18 @@ def test_complex_resolution(self):
def test_reference_resolution(self):
self.checkReferenceResolution(
+ ([[0, 1]], "Foo v. Bar, 1 U.S. 1 ... Foo at 2"),
+ ([[0]], "Foo at 2. .... ; Foo v. Bar, 1 U.S. 1"),
(
[[0, 1]],
- "Foo v. Bar, 1 U.S. 1 ... Foo at 2"
- ),
- (
- [[0]],
- "Foo at 2. .... ; Foo v. Bar, 1 U.S. 1"
- ),
- (
- [[0, 1]],
- "Foo v. Bar 1 U.S. 12, 347-348. something something, In Foo at 62 we see that"
+ "Foo v. Bar 1 U.S. 12, 347-348. something something, In Foo at 62 we see that",
),
(
[[0, 2], [1]],
- "Foo v. Bar 1 U.S. 12, 347-348; 12 U.S. 1. someting; In Foo at 2 we see that"
+ "Foo v. Bar 1 U.S. 12, 347-348; 12 U.S. 1. someting; In Foo at 2 we see that",
),
(
[[0, 2], [1]],
- "Foo v. Bar 1 U.S. 12, 347-348; In Smith, 12 U.S. 1 (1999) we saw something else. someting. In Foo at 2 we see that"
- )
- )
\ No newline at end of file
+ "Foo v. Bar 1 U.S. 12, 347-348; In Smith, 12 U.S. 1 (1999) we saw something else. someting. In Foo at 2 we see that",
+ ),
+ )
From fa419aaf6f37d3a9fdd3035026fc20806e15eadb Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 10 Jan 2025 15:08:34 -0500
Subject: [PATCH 03/40] fix(eyecite): Flake8 fixes
---
tests/test_FindTest.py | 35 +++++++++++------------------------
1 file changed, 11 insertions(+), 24 deletions(-)
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 0da5d918..b85403ef 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -448,16 +448,16 @@ def test_find_citations(self):
year=1983,
metadata={'plaintiff': 'Amick',
'defendant': 'Liberty Mut. Ins. Co.',
- 'court':'ri'
+ 'court': 'ri'
}),
- reference_citation('Amick at 795', metadata={'plaintiff':'Amick', 'pin_cite': '795'})]),
+ reference_citation('Amick at 795', metadata={'plaintiff': 'Amick', 'pin_cite': '795'})]),
# Test reference citation
('Foo v. Bar 1 U.S. 12, 347-348. something something, In Foo at 62 we see that',
[case_citation(page='12',
metadata={'plaintiff': 'Foo',
'defendant': 'Bar',
'pin_cite': '347-348'}),
- reference_citation('Foo at 62', metadata={'plaintiff':'Foo', 'pin_cite': '62'})]),
+ reference_citation('Foo at 62', metadata={'plaintiff': 'Foo', 'pin_cite': '62'})]),
# Test that reference citation must occur after full case citation
('In Foo at 62 we see that, Foo v. Bar 1 U.S. 12, 347-348. something something,',
[case_citation(page='12',
@@ -466,30 +466,16 @@ def test_find_citations(self):
'pin_cite': '347-348'})]),
# Test reference against defendant name
('In re Foo 1 Mass. 12, 347-348. something something, in Foo at 62 we see that, ',
- [case_citation(page='12',
- reporter="Mass.",
- volume="1",
- metadata={
- 'defendant': 'Foo',
- 'pin_cite': '347-348'
- }),
- reference_citation('Foo at 62', metadata={'defendant': 'Foo',
- "pin_cite": "62"}),
- ]),
+ [case_citation(page='12', reporter="Mass.", volume="1",
+ metadata={'defendant': 'Foo', 'pin_cite': '347-348'})]),
# Test reference citation after an id citation
('we said in Morton v. Mancari, 417 U. S. 535, 552 (1974) “Literally every piece ....”. “asisovereign tribal entities . . . .” Id. In Mancari at 665',
- [case_citation(page='535',
- year=1974,
- volume="417",
+ [case_citation(page='535', year=1974, volume="417",
reporter="U. S.",
- metadata={'plaintiff': 'Morton',
- 'defendant': 'Mancari',
- "pin_cite": "552",
- "court":"scotus",
- }),
+ metadata={'plaintiff': 'Morton', 'defendant': 'Mancari', "pin_cite": "552", "court": "scotus"}),
id_citation('Id.,', metadata={}),
- reference_citation('Mancari', metadata={'defendant':'Mancari', "pin_cite": "665"}),
- ]),
+ reference_citation('Mancari',
+ metadata={'defendant': 'Mancari', "pin_cite": "665"})]),
# Test Conn. Super. Ct. regex variation.
('Failed to recognize 1993 Conn. Super. Ct. 5243-P',
[case_citation(volume='1993', reporter='Conn. Super. Ct.',
@@ -776,7 +762,8 @@ def test_citation_fullspan(self):
self.assertEqual(
extracted[cit_idx].full_span()[0],
start,
- f"full_span start index doesn't match for {extracted[cit_idx]}",
+ f"full_span start index doesn't match for {
+ extracted[cit_idx]}",
)
self.assertEqual(
extracted[cit_idx].full_span()[1],
From b56bf22e0b9d74af4f8761987fd90891dd2e0b5d Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 10 Jan 2025 15:26:54 -0500
Subject: [PATCH 04/40] fix(eyecite): Flake8 fixes
---
eyecite/find.py | 2 +-
eyecite/models.py | 6 +++---
tests/test_FindTest.py | 8 +++++---
3 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/eyecite/find.py b/eyecite/find.py
index 4e9defc2..dfc717f2 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -134,7 +134,7 @@ def _extract_reference_citations(
:param plain_text: the text
:return: Pincite reference citations
"""
- if type(citation) != FullCaseCitation:
+ if not isinstance(citation, FullCaseCitation):
# Skip if not case law citation
return []
if not citation.metadata.defendant:
diff --git a/eyecite/models.py b/eyecite/models.py
index 762dbbbf..2d5d7a99 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -313,9 +313,9 @@ def is_parallel_citation(self, preceding: CitationBase):
"""
is_parallel = (
- self.full_span_start == preceding.full_span_start
- and self.full_span_end == preceding.full_span_end
- and type(preceding) == FullCaseCitation
+ self.full_span_start == preceding.full_span_start
+ and self.full_span_end == preceding.full_span_end
+ and isinstance(preceding, FullCaseCitation)
)
if is_parallel:
self.metadata.defendant = preceding.metadata.defendant
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index b85403ef..c04d7bbe 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -467,7 +467,10 @@ def test_find_citations(self):
# Test reference against defendant name
('In re Foo 1 Mass. 12, 347-348. something something, in Foo at 62 we see that, ',
[case_citation(page='12', reporter="Mass.", volume="1",
- metadata={'defendant': 'Foo', 'pin_cite': '347-348'})]),
+ metadata={'defendant': 'Foo', 'pin_cite': '347-348'}),
+ reference_citation('Foo at 62',
+ metadata={'defendant': 'Foo',
+ "pin_cite": "62"})]),
# Test reference citation after an id citation
('we said in Morton v. Mancari, 417 U. S. 535, 552 (1974) “Literally every piece ....”. “asisovereign tribal entities . . . .” Id. In Mancari at 665',
[case_citation(page='535', year=1974, volume="417",
@@ -762,8 +765,7 @@ def test_citation_fullspan(self):
self.assertEqual(
extracted[cit_idx].full_span()[0],
start,
- f"full_span start index doesn't match for {
- extracted[cit_idx]}",
+ f"full_span start index doesn't match for {extracted[cit_idx]}",
)
self.assertEqual(
extracted[cit_idx].full_span()[1],
From af97e06c6625c0cc1d21c0bd54c4ae6589bcfe4b Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 10 Jan 2025 15:28:59 -0500
Subject: [PATCH 05/40] fix(eyecite): Black lint
---
eyecite/models.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/eyecite/models.py b/eyecite/models.py
index 2d5d7a99..49cc3f10 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -313,9 +313,9 @@ def is_parallel_citation(self, preceding: CitationBase):
"""
is_parallel = (
- self.full_span_start == preceding.full_span_start
- and self.full_span_end == preceding.full_span_end
- and isinstance(preceding, FullCaseCitation)
+ self.full_span_start == preceding.full_span_start
+ and self.full_span_end == preceding.full_span_end
+ and isinstance(preceding, FullCaseCitation)
)
if is_parallel:
self.metadata.defendant = preceding.metadata.defendant
From 8014239fae93bd90c30454de466b4c16c226e331 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 10 Jan 2025 15:39:48 -0500
Subject: [PATCH 06/40] fix(eyecite): iSort
---
eyecite/find.py | 2 +-
eyecite/resolve.py | 2 +-
eyecite/test_factories.py | 4 ++--
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/eyecite/find.py b/eyecite/find.py
index dfc717f2..05f70ddb 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -9,6 +9,7 @@
order_citations,
)
from eyecite.models import (
+ CaseReferenceToken,
CitationBase,
CitationToken,
FullCaseCitation,
@@ -25,7 +26,6 @@
SupraToken,
Tokens,
UnknownCitation,
- CaseReferenceToken,
)
from eyecite.regexes import SHORT_CITE_ANTECEDENT_REGEX, SUPRA_ANTECEDENT_REGEX
from eyecite.tokenizers import Tokenizer, default_tokenizer
diff --git a/eyecite/resolve.py b/eyecite/resolve.py
index 440dd0ba..01d3dfb5 100644
--- a/eyecite/resolve.py
+++ b/eyecite/resolve.py
@@ -7,11 +7,11 @@
FullCaseCitation,
FullCitation,
IdCitation,
+ ReferenceCitation,
Resource,
ResourceType,
ShortCaseCitation,
SupraCitation,
- ReferenceCitation,
)
from eyecite.utils import strip_punct
diff --git a/eyecite/test_factories.py b/eyecite/test_factories.py
index a14aa077..f736a95a 100644
--- a/eyecite/test_factories.py
+++ b/eyecite/test_factories.py
@@ -1,18 +1,18 @@
from eyecite.helpers import get_year
from eyecite.models import (
+ CaseReferenceToken,
CitationToken,
FullCaseCitation,
FullJournalCitation,
FullLawCitation,
IdCitation,
IdToken,
+ ReferenceCitation,
SectionToken,
ShortCaseCitation,
SupraCitation,
SupraToken,
UnknownCitation,
- ReferenceCitation,
- CaseReferenceToken,
)
from eyecite.tokenizers import EDITIONS_LOOKUP
From 95fdb1b0f3635fdf808b504daa0d723b5eca4b97 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 10 Jan 2025 16:14:13 -0500
Subject: [PATCH 07/40] chore(eyecite): lint
---
eyecite/find.py | 7 +++----
eyecite/models.py | 4 +---
eyecite/resolve.py | 2 --
3 files changed, 4 insertions(+), 9 deletions(-)
diff --git a/eyecite/find.py b/eyecite/find.py
index 05f70ddb..e22d204c 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -58,7 +58,7 @@ def get_citations(
return joke_cite
words, citation_tokens = tokenizer.tokenize(plain_text)
- citations = []
+ citations: list[CitationBase] = []
for i, token in citation_tokens:
citation: CitationBase
@@ -73,9 +73,8 @@ def get_citations(
if citation_token.short:
citation = _extract_shortform_citation(words, i)
else:
- citation: FullCitation = _extract_full_citation(words, i)
- if citations and citation.is_parallel_citation(citations[-1]):
- # Check if parallel citation and merge plaintiff/defendants
+ citation = _extract_full_citation(words, i)
+ if citations and isinstance(citation, FullCitation):
citation.is_parallel_citation(citations[-1])
# Check for reference citations that follow a full citation
diff --git a/eyecite/models.py b/eyecite/models.py
index 49cc3f10..81609220 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -304,13 +304,10 @@ class FullCitation(ResourceCitation):
def is_parallel_citation(self, preceding: CitationBase):
"""Check if preceding citation is parallel
- If parallel match plaintiff and defendant metadata
-
Args:
preceding (): The previous citation found
Returns: None
-
"""
is_parallel = (
self.full_span_start == preceding.full_span_start
@@ -318,6 +315,7 @@ def is_parallel_citation(self, preceding: CitationBase):
and isinstance(preceding, FullCaseCitation)
)
if is_parallel:
+ # if parallel merge plaintiff/defendant data
self.metadata.defendant = preceding.metadata.defendant
self.metadata.plaintiff = preceding.metadata.plaintiff
diff --git a/eyecite/resolve.py b/eyecite/resolve.py
index 01d3dfb5..ea3ab70e 100644
--- a/eyecite/resolve.py
+++ b/eyecite/resolve.py
@@ -91,8 +91,6 @@ def _filter_by_matching_plaintiff_or_defendant(
) -> Optional[ResourceType]:
"""Filter out any impossible reference citations"""
matches: List[ResourceType] = []
- plaintiff: str = strip_punct(plaintiff or "")
- defendant: str = strip_punct(defendant or "")
for full_citation, resource in resolved_full_cites:
if not isinstance(full_citation, FullCaseCitation):
continue
From b239171437c7efeb79319285eac8054fad5246c0 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 10 Jan 2025 16:21:10 -0500
Subject: [PATCH 08/40] chore(eyecite): lint
---
eyecite/resolve.py | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/eyecite/resolve.py b/eyecite/resolve.py
index ea3ab70e..a8986e71 100644
--- a/eyecite/resolve.py
+++ b/eyecite/resolve.py
@@ -91,20 +91,22 @@ def _filter_by_matching_plaintiff_or_defendant(
) -> Optional[ResourceType]:
"""Filter out any impossible reference citations"""
matches: List[ResourceType] = []
+
for full_citation, resource in resolved_full_cites:
if not isinstance(full_citation, FullCaseCitation):
continue
- if (
- full_citation.metadata.defendant
- and defendant in full_citation.metadata.defendant
- ):
- matches.append(resource)
- elif (
- full_citation.metadata.plaintiff
- and plaintiff in full_citation.metadata.plaintiff
- ):
+ defendant_match = (
+ defendant
+ and full_citation.metadata.defendant
+ and defendant in full_citation.metadata.defendant
+ )
+ plaintiff_match = (
+ plaintiff
+ and full_citation.metadata.plaintiff
+ and plaintiff in full_citation.metadata.plaintiff
+ )
+ if defendant_match or plaintiff_match:
matches.append(resource)
-
# Remove duplicates and only accept if one candidate remains
matches = list(set(matches))
return matches[0] if len(matches) == 1 else None
From a785288672fec4a840ebc8f43de604a0a11adae0 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 10 Jan 2025 16:23:30 -0500
Subject: [PATCH 09/40] chore(eyecite): lint
---
eyecite/resolve.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/eyecite/resolve.py b/eyecite/resolve.py
index a8986e71..6ec55902 100644
--- a/eyecite/resolve.py
+++ b/eyecite/resolve.py
@@ -96,14 +96,14 @@ def _filter_by_matching_plaintiff_or_defendant(
if not isinstance(full_citation, FullCaseCitation):
continue
defendant_match = (
- defendant
- and full_citation.metadata.defendant
- and defendant in full_citation.metadata.defendant
+ defendant
+ and full_citation.metadata.defendant
+ and defendant in full_citation.metadata.defendant
)
plaintiff_match = (
- plaintiff
- and full_citation.metadata.plaintiff
- and plaintiff in full_citation.metadata.plaintiff
+ plaintiff
+ and full_citation.metadata.plaintiff
+ and plaintiff in full_citation.metadata.plaintiff
)
if defendant_match or plaintiff_match:
matches.append(resource)
From 9ea2169685f2a12c5c238df7731fd2abf108ca06 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Tue, 14 Jan 2025 12:06:03 -0500
Subject: [PATCH 10/40] fix(find.py): Add fixes for over-identifying reference
citations
Fix typos
---
eyecite/find.py | 32 +++++++++++++++++++++++---------
eyecite/helpers.py | 21 ++++++++++++++++-----
eyecite/resolve.py | 3 ++-
tests/test_AnnotateTest.py | 2 +-
tests/test_FindTest.py | 10 ++++++++++
5 files changed, 52 insertions(+), 16 deletions(-)
diff --git a/eyecite/find.py b/eyecite/find.py
index e22d204c..1fbfbd94 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -4,9 +4,9 @@
from eyecite.helpers import (
disambiguate_reporters,
extract_pin_cite,
+ filter_citations,
joke_cite,
match_on_tokens,
- order_citations,
)
from eyecite.models import (
CaseReferenceToken,
@@ -110,7 +110,7 @@ def get_citations(
citations.append(citation)
- citations = order_citations(citations)
+ citations = filter_citations(citations)
# Remove citations with multiple reporter candidates where we couldn't
# guess correct reporter
@@ -139,13 +139,23 @@ def _extract_reference_citations(
if not citation.metadata.defendant:
# Skip if no defendant exists
return []
- escaped_plaintiff = re.escape(citation.metadata.plaintiff or "")
- escaped_defendant = re.escape(citation.metadata.defendant)
+ plaintiff_regex = (
+ rf"(?P{re.escape(citation.metadata.plaintiff)})"
+ if citation.metadata.plaintiff
+ else ""
+ )
+ defendant_regex = (
+ rf"(?P{re.escape(citation.metadata.defendant)})"
+ if citation.metadata.defendant
+ else ""
+ )
+
+ # Combine the components if they are not empty
+ combined_regex_parts = "|".join(
+ filter(None, [plaintiff_regex, defendant_regex])
+ )
pin_cite_regex = (
- rf"\b(?:"
- rf"(?P{escaped_plaintiff})|"
- rf"(?P{escaped_defendant})\s?"
- rf")\s+at\s+(?P\d{{1,5}})?\b"
+ rf"\b(?:{combined_regex_parts})\s+at\s+(?P\d{{1,5}})\b"
)
pin_cite_pattern = re.compile(pin_cite_regex)
@@ -169,7 +179,11 @@ def _extract_reference_citations(
full_span_end=end + offset,
index=0,
metadata={
- "plaintiff": match.group("plaintiff"),
+ "plaintiff": (
+ match.group("plaintiff")
+ if "plaintiff" in match.groupdict()
+ else None
+ ),
"defendant": match.group("defendant"),
"pin_cite": match.group("page"),
},
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index 50285d65..a1778b64 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -307,14 +307,25 @@ def disambiguate_reporters(
]
-def order_citations(citations: List[CitationBase]) -> List[CitationBase]:
- """
- Order citations that may have reference citations out or sequential order
+def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
+ """Filter and order citations that may have reference cites out of order
:param citations: List of citation`
- :return: Sorted citations
+ :return: Sorted and filtered citations
"""
- return sorted(citations, key=lambda citation: citation.span())
+ filtered_citations = []
+ sorted_citations = sorted(citations, key=lambda citation: citation.span())
+ for citation in sorted_citations:
+ if filtered_citations:
+ last_citation = filtered_citations[-1]
+ last_span = last_citation.span()
+ current_span = citation.span()
+
+ if current_span[0] <= last_span[1]:
+ # Remove overlapping citations that can occur in edge cases
+ continue
+ filtered_citations.append(citation)
+ return filtered_citations
joke_cite: List[CitationBase] = [
diff --git a/eyecite/resolve.py b/eyecite/resolve.py
index 6ec55902..7f09ccfb 100644
--- a/eyecite/resolve.py
+++ b/eyecite/resolve.py
@@ -213,7 +213,8 @@ def _resolve_reference_citation(
reference_citation: ReferenceCitation,
resolved_full_cites: ResolvedFullCites,
) -> Optional[ResourceType]:
- """
+ """Resolve reference citations
+
Try to resolve reference citations by checking whether their is only one
full citation that appears with either the defendant or plaintiff
field of any of the previously resolved full citations.
diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py
index a7da5888..e61c7af3 100644
--- a/tests/test_AnnotateTest.py
+++ b/tests/test_AnnotateTest.py
@@ -47,7 +47,7 @@ def lower_annotator(before, text, after):
"<0>1 U.S. 10>. Foo v. Bar, <1>supra at 21>.",
[],
),
- # Refernce cite
+ # Reference cite
(
"Foo v. Bar 1 U.S. 1. In Foo at 2.",
"Foo v. Bar <0>1 U.S. 10>. In <1>Foo at 21>.",
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index c04d7bbe..fb035bfb 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -471,6 +471,16 @@ def test_find_citations(self):
reference_citation('Foo at 62',
metadata={'defendant': 'Foo',
"pin_cite": "62"})]),
+ # Test reference citation that contains at
+ ('In re Foo 1 Mass. 12, 347-348. something something, in at we see that',
+ [case_citation(page='12', reporter="Mass.", volume="1",
+ metadata={'defendant': 'Foo', 'pin_cite': '347-348'})]),
+ # Test U.S. as plaintiff with reference citations
+ ('U.S. v. Boch Oldsmobile, Inc., 909 F.2d 657, 660 (1st Cir.1990); Piper Aircraft, 454 U.S. at 241',
+ [case_citation(page='657', reporter="F.2d", volume="909",
+ metadata={'plaintiff': 'U.S.', 'defendant': 'Boch Oldsmobile, Inc.', 'pin_cite': '660'}),
+ case_citation(volume="454", page='241', reporter_found='U.S.', short=True,
+ metadata={'antecedent_guess': 'Aircraft', 'court': "scotus", 'pin_cite': None})]),
# Test reference citation after an id citation
('we said in Morton v. Mancari, 417 U. S. 535, 552 (1974) “Literally every piece ....”. “asisovereign tribal entities . . . .” Id. In Mancari at 665',
[case_citation(page='535', year=1974, volume="417",
From 6507d01f3682e4000ea1816560495be3567bfb20 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Tue, 14 Jan 2025 13:17:47 -0500
Subject: [PATCH 11/40] fix(tests): Lint and test fixes
---
eyecite/helpers.py | 2 +-
tests/test_FindTest.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index a1778b64..c3ab843e 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -313,7 +313,7 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
:param citations: List of citation`
:return: Sorted and filtered citations
"""
- filtered_citations = []
+ filtered_citations: List[CitationBase] = []
sorted_citations = sorted(citations, key=lambda citation: citation.span())
for citation in sorted_citations:
if filtered_citations:
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index fb035bfb..fbfc678b 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -480,7 +480,7 @@ def test_find_citations(self):
[case_citation(page='657', reporter="F.2d", volume="909",
metadata={'plaintiff': 'U.S.', 'defendant': 'Boch Oldsmobile, Inc.', 'pin_cite': '660'}),
case_citation(volume="454", page='241', reporter_found='U.S.', short=True,
- metadata={'antecedent_guess': 'Aircraft', 'court': "scotus", 'pin_cite': None})]),
+ metadata={'antecedent_guess': 'Aircraft', 'court': "scotus", 'pin_cite': "241"})]),
# Test reference citation after an id citation
('we said in Morton v. Mancari, 417 U. S. 535, 552 (1974) “Literally every piece ....”. “asisovereign tribal entities . . . .” Id. In Mancari at 665',
[case_citation(page='535', year=1974, volume="417",
From 67df9a86c1f83c1eb9d041d2f558c9508822bf8d Mon Sep 17 00:00:00 2001
From: William Palin
Date: Wed, 22 Jan 2025 14:07:00 -0500
Subject: [PATCH 12/40] fix(tests): Make reference citation extraction stricter
Limit the names that can be used to better formatted
plaintiff/defendants
Add tests to show filtering/ordering of reference citations
Also refactor add_defendant for the edge case where the defendant
could be only whitespace.
Fix typos, etc.
---
eyecite/find.py | 68 +++++++++++++++++++-----------------------
eyecite/helpers.py | 6 ++--
eyecite/models.py | 4 ++-
tests/test_FindTest.py | 32 ++++++++++++++++++++
4 files changed, 70 insertions(+), 40 deletions(-)
diff --git a/eyecite/find.py b/eyecite/find.py
index 1fbfbd94..c3cc4c3f 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -131,44 +131,46 @@ def _extract_reference_citations(
:param citation: the full case citation found
:param plain_text: the text
- :return: Pincite reference citations
+ :return: Pin cite reference citations
"""
- if not isinstance(citation, FullCaseCitation):
- # Skip if not case law citation
+ if len(plain_text) <= citation.span()[-1]:
return []
- if not citation.metadata.defendant:
- # Skip if no defendant exists
+ if not isinstance(citation, FullCaseCitation):
return []
- plaintiff_regex = (
- rf"(?P{re.escape(citation.metadata.plaintiff)})"
- if citation.metadata.plaintiff
- else ""
- )
- defendant_regex = (
- rf"(?P{re.escape(citation.metadata.defendant)})"
- if citation.metadata.defendant
- else ""
- )
- # Combine the components if they are not empty
- combined_regex_parts = "|".join(
- filter(None, [plaintiff_regex, defendant_regex])
- )
- pin_cite_regex = (
- rf"\b(?:{combined_regex_parts})\s+at\s+(?P\d{{1,5}})\b"
- )
+ def is_valid_name(name: str) -> bool:
+ """Validate name isnt a regex issue
- pin_cite_pattern = re.compile(pin_cite_regex)
- reference_citations = []
- if len(plain_text) <= citation.span()[-1]:
- return []
+ Excludes strings like Co., numbers or lower case strs
+
+ :param name: The name to check
+ :return: True if usable, false if not
+ """
+ return (
+ isinstance(name, str)
+ and len(name) > 2
+ and name[0].isupper()
+ and not name.endswith(".")
+ and not name.isdigit()
+ )
+ regexes = [
+ rf"(?P<{key}>{re.escape(value)})"
+ for key in ["plaintiff", "defendant"]
+ if (value := getattr(citation.metadata, key, None))
+ and is_valid_name(value)
+ ]
+ if not regexes:
+ return []
+ pin_cite_re = (
+ rf"\b(?:{'|'.join(regexes)})\s+at\s+(?P\d{{1,5}})\b"
+ )
+ reference_citations = []
remaining_text = plain_text[citation.span()[-1] :]
offset = citation.span()[-1]
- for match in pin_cite_pattern.finditer(remaining_text):
+ for match in re.compile(pin_cite_re).finditer(remaining_text):
start, end = match.span()
matched_text = match.group(0)
-
reference = ReferenceCitation(
token=CaseReferenceToken(
data=matched_text, start=start + offset, end=end + offset
@@ -178,15 +180,7 @@ def _extract_reference_citations(
full_span_start=start + offset,
full_span_end=end + offset,
index=0,
- metadata={
- "plaintiff": (
- match.group("plaintiff")
- if "plaintiff" in match.groupdict()
- else None
- ),
- "defendant": match.group("defendant"),
- "pin_cite": match.group("page"),
- },
+ metadata=match.groupdict(),
)
reference_citations.append(reference)
return reference_citations
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index c3ab843e..c5ccfc6b 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -133,9 +133,11 @@ def add_defendant(citation: CaseCitation, words: Tokens) -> None:
break
if start_index:
citation.full_span_start = citation.span()[0] - offset
- citation.metadata.defendant = "".join(
+ defendant = "".join(
str(w) for w in words[start_index : citation.index]
).strip(", ")
+ if defendant.strip():
+ citation.metadata.defendant = defendant
def add_law_metadata(citation: FullLawCitation, words: Tokens) -> None:
@@ -310,7 +312,7 @@ def disambiguate_reporters(
def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
"""Filter and order citations that may have reference cites out of order
- :param citations: List of citation`
+ :param citations: List of citations
:return: Sorted and filtered citations
"""
filtered_citations: List[CitationBase] = []
diff --git a/eyecite/models.py b/eyecite/models.py
index 81609220..c0fc5e25 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -315,7 +315,9 @@ def is_parallel_citation(self, preceding: CitationBase):
and isinstance(preceding, FullCaseCitation)
)
if is_parallel:
- # if parallel merge plaintiff/defendant data
+ # if parallel get plaintiff/defendant data from
+ # the earlier citation, since it won't be on the
+ # parallel one.
self.metadata.defendant = preceding.metadata.defendant
self.metadata.plaintiff = preceding.metadata.plaintiff
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index fbfc678b..b1b81242 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -441,6 +441,38 @@ def test_find_citations(self):
[],),
('lorem 111 N. W. 12th St.',
[],),
+ # Eyecite has issue with linebreaks when identifying defendants and
+ # previously could store defendant as only whitespace
+ ('\n rt. denied,\n \n \n 541 U.S. 1085 (2004);\n \n',
+ [case_citation(
+ page='1085',
+ volume="541",
+ reporter="U.S.",
+ year=2004,
+ metadata={'plaintiff': None,
+ 'defendant': None,
+ 'court': 'scotus'
+ }
+ )],
+ {'clean': ['html', 'inline_whitespace']}),
+ # Test filtering overlapping citations - this finds four citations
+ # but should filter down to three
+ ("Miles v. Smith 1 Ga. 1; asdfasdf asd Something v. Else, 1 Miles 3; 1 Miles at 10",
+ [case_citation(page='1',
+ volume="1",
+ reporter="Ga.",
+ metadata={'plaintiff': 'Miles',
+ 'defendant': 'Smith',
+ }),
+ case_citation(page='3',
+ volume="1",
+ reporter="Miles",
+ metadata={'plaintiff': 'Something',
+ 'defendant': 'Else'}
+ ),
+ case_citation(volume="1", page='10', reporter='Miles',
+ short=True,
+ metadata={'pin_cite': '10'})]),
('General Casualty cites as compelling Amick v. Liberty Mut. Ins. Co., 455 A.2d 793 (R.I. 1983). In that case ... Stats, do. See Amick at 795',
[case_citation(page='793',
volume="455",
From b4f6ff246a73e7a96b789eafed58f2645b524bf9 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Wed, 22 Jan 2025 14:21:54 -0500
Subject: [PATCH 13/40] fix(tests): Flake8
---
tests/test_FindTest.py | 33 +++++++++++++++------------------
1 file changed, 15 insertions(+), 18 deletions(-)
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index b1b81242..16de99d3 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -451,28 +451,25 @@ def test_find_citations(self):
year=2004,
metadata={'plaintiff': None,
'defendant': None,
- 'court': 'scotus'
- }
- )],
+ 'court': 'scotus'})],
{'clean': ['html', 'inline_whitespace']}),
# Test filtering overlapping citations - this finds four citations
# but should filter down to three
("Miles v. Smith 1 Ga. 1; asdfasdf asd Something v. Else, 1 Miles 3; 1 Miles at 10",
- [case_citation(page='1',
- volume="1",
- reporter="Ga.",
- metadata={'plaintiff': 'Miles',
- 'defendant': 'Smith',
- }),
- case_citation(page='3',
- volume="1",
- reporter="Miles",
- metadata={'plaintiff': 'Something',
- 'defendant': 'Else'}
- ),
- case_citation(volume="1", page='10', reporter='Miles',
- short=True,
- metadata={'pin_cite': '10'})]),
+ [case_citation(page='1',
+ volume="1",
+ reporter="Ga.",
+ metadata={'plaintiff': 'Miles',
+ 'defendant': 'Smith'}),
+ case_citation(page='3',
+ volume="1",
+ reporter="Miles",
+ metadata={'plaintiff': 'Something',
+ 'defendant': 'Else'}
+ ),
+ case_citation(volume="1", page='10', reporter='Miles',
+ short=True,
+ metadata={'pin_cite': '10'})]),
('General Casualty cites as compelling Amick v. Liberty Mut. Ins. Co., 455 A.2d 793 (R.I. 1983). In that case ... Stats, do. See Amick at 795',
[case_citation(page='793',
volume="455",
From 5edc50dae20a53ffafaf7c4bd30724bcaa2de070 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Wed, 22 Jan 2025 16:34:24 -0500
Subject: [PATCH 14/40] chore(filter_citations): Update docstring
---
eyecite/helpers.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index 996f7894..9ab88f53 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -318,7 +318,12 @@ def disambiguate_reporters(
def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
- """Filter and order citations that may have reference cites out of order
+ """Filter and order citations, ensuring reference citations are in sequence
+
+ This function resolves rare but possible overlaps between ref. citations
+ and short citations. It also orders all citations by their `citation.span`,
+ as reference citations may be extracted out of order. The final result is a
+ properly sorted list of citations as they appear in the text
:param citations: List of citations
:return: Sorted and filtered citations
From 313dcd66892470ccd3a717d8c682d9f91e5a3a66 Mon Sep 17 00:00:00 2001
From: rachlllg
Date: Mon, 27 Jan 2025 14:52:31 -0800
Subject: [PATCH 15/40] update_tutorial
---
TUTORIAL.ipynb | 151 +++++++++++++++++++++++++++----------------------
1 file changed, 82 insertions(+), 69 deletions(-)
diff --git a/TUTORIAL.ipynb b/TUTORIAL.ipynb
index 2bfa0c32..ef358901 100644
--- a/TUTORIAL.ipynb
+++ b/TUTORIAL.ipynb
@@ -54,7 +54,7 @@
"metadata": {},
"outputs": [],
"source": [
- "opinion_url = 'https://www.courtlistener.com/api/rest/v3/opinions/1741/'\n",
+ "opinion_url = 'https://www.courtlistener.com/api/rest/v4/opinions/1741/'\n",
"opinion_text = requests.get(opinion_url).json()['plain_text']"
]
},
@@ -68,31 +68,31 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "(Slip Opinion) OCTOBER TERM, 2009 1\r\n",
- "\r\n",
- " Syllabus\r\n",
- "\r\n",
- " NOTE: Where it is feasible, a syllabus (headnote) will be released, as is\r\n",
- " being done in connection with this case, at the time the opinion is issued.\r\n",
- " The syllabus constitutes no part of the opinion of the Court but has been\r\n",
- " prepared by the Reporter of Decisions for the convenience of the reader.\r\n",
- " See United States v. Detroit Timber & Lumber Co., 200 U. S. 321, 337.\r\n",
- "\r\n",
- "\r\n",
- "SUPREME COURT OF THE UNITED STATES\r\n",
- "\r\n",
- " Syllabus\r\n",
- "\r\n",
- " CITIZENS UNITED v. FEDERAL ELECTION\r\n",
- "\r\n",
- " COMMISSION \r\n",
- "\r\n",
- "\r\n",
- "APPEAL FROM THE UNITED STATES DISTRICT COURT FOR THE\r\n",
- " DISTRICT OF COLUMBIA\r\n",
- "\r\n",
- "No. 08–205. Argued March 24, 2009—Reargued September 9, 2009––\r\n",
- " Decided January 21, 2010\r\n",
+ "(Slip Opinion) OCTOBER TERM, 2009 1\n",
+ "\n",
+ " Syllabus\n",
+ "\n",
+ " NOTE: Where it is feasible, a syllabus (headnote) will be released, as is\n",
+ " being done in connection with this case, at the time the opinion is issued.\n",
+ " The syllabus constitutes no part of the opinion of the Court but has been\n",
+ " prepared by the Reporter of Decisions for the convenience of the reader.\n",
+ " See United States v. Detroit Timber & Lumber Co., 200 U. S. 321, 337.\n",
+ "\n",
+ "\n",
+ "SUPREME COURT OF THE UNITED STATES\n",
+ "\n",
+ " Syllabus\n",
+ "\n",
+ " CITIZENS UNITED v. FEDERAL ELECTION\n",
+ "\n",
+ " COMMISSION \n",
+ "\n",
+ "\n",
+ "APPEAL FROM THE UNITED STATES DISTRICT COURT FOR THE\n",
+ " DISTRICT OF COLUMBIA\n",
+ "\n",
+ "No. 08–205. Argued March 24, 2009—Reargued September 9, 2009––\n",
+ " Decided January 21, 2010\n",
"As amended by §203 of the Bipartisan Campaign Reform Act of\n"
]
}
@@ -163,6 +163,27 @@
"Next, we'll extract the citations using a custom tokenizer. Unlike the default tokenizer, here we'll use our hyperscan tokenizer for much faster extraction, which works by automatically pre-compiling and caching a regular expression database on first use. Because of this one-time pre-compilation stage, the first use of this tokenizer is slow:"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "9f503e9d-7432-4454-9864-bd38b5237d93",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: hyperscan in /Users/rachelgao/anaconda3/envs/test/lib/python3.10/site-packages (0.7.8)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# pip install hyperscan if not already installed\n",
+ "!pip install hyperscan"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 7,
@@ -173,8 +194,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "CPU times: user 14.9 s, sys: 301 ms, total: 15.2 s\n",
- "Wall time: 15.7 s\n"
+ "CPU times: user 962 ms, sys: 20.9 ms, total: 983 ms\n",
+ "Wall time: 1.01 s\n"
]
}
],
@@ -202,8 +223,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "CPU times: user 183 ms, sys: 5.74 ms, total: 189 ms\n",
- "Wall time: 198 ms\n"
+ "CPU times: user 829 ms, sys: 3.07 ms, total: 832 ms\n",
+ "Wall time: 831 ms\n"
]
}
],
@@ -230,7 +251,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Extracted 1005 citations.\n",
+ "Extracted 1113 citations.\n",
"\n",
"First citation:\n",
" FullCaseCitation('200 U. S. 321', groups={'volume': '200', 'reporter': 'U. S.', 'page': '321'}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite='337', year=None, court='scotus', plaintiff='States', defendant='Detroit Timber & Lumber Co.', extra=None))\n"
@@ -286,7 +307,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Resolved citations into 176 groups.\n",
+ "Resolved citations into 284 groups.\n",
"\n"
]
}
@@ -314,12 +335,12 @@
"output_type": "stream",
"text": [
"This case is cited lots of times:\n",
- "FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical='MCFL', pin_cite='249', year='1986', court='scotus', plaintiff='Comm’n', defendant='Massachusetts Citizens for Life, Inc.', extra=None))\n",
+ "FullCaseCitation('558 U. S. ____', groups={'volume': '558', 'reporter': 'U. S.', 'page': None}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite=None, year='2010', court='scotus', plaintiff=None, defendant=None, extra=None))\n",
"\n",
- "23 times, in fact.\n",
+ "1 times, in fact.\n",
"\n",
"Here are all of its citations:\n",
- "[FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical='MCFL', pin_cite='249', year='1986', court='scotus', plaintiff='Comm’n', defendant='Massachusetts Citizens for Life, Inc.', extra=None)), ShortCaseCitation('479 U. S., at 257', groups={'volume': '479', 'reporter': 'U. S.', 'page': '257'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='257', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 260', groups={'volume': '479', 'reporter': 'U. S.', 'page': '260'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='260', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 262', groups={'volume': '479', 'reporter': 'U. S.', 'page': '262'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='262', year=None, court='scotus', antecedent_guess='MCFL')), FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite=None, year='1986', court='scotus', plaintiff='Comm’n', defendant='Massachusetts Citizens for Life, Inc.', extra=None)), FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical='MCFL', pin_cite=None, year='1986', court='scotus', plaintiff='FEC', defendant='Massachusetts Citizens for Life, Inc.', extra=None)), FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite=None, year=None, court='scotus', plaintiff=None, defendant=None, extra=None)), FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. 
S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite=None, year=None, court='scotus', plaintiff=None, defendant=None, extra=None)), IdCitation('id.,', metadata=IdCitation.Metadata(parenthetical='quoting NRWC, 459 U. S., at 209–210', pin_cite='at 256')), ShortCaseCitation('479 U. S., at 257', groups={'volume': '479', 'reporter': 'U. S.', 'page': '257'}, metadata=ShortCaseCitation.Metadata(parenthetical='internal quotation marks omitted', pin_cite='257', year=None, court='scotus', antecedent_guess=None)), IdCitation('id.,', metadata=IdCitation.Metadata(parenthetical='internal quotation marks omitted', pin_cite='at 260')), IdCitation('id.,', metadata=IdCitation.Metadata(parenthetical=None, pin_cite='at 257')), ShortCaseCitation('479 U. S., at 268', groups={'volume': '479', 'reporter': 'U. S.', 'page': '268'}, metadata=ShortCaseCitation.Metadata(parenthetical='opinion of Rehnquist, C. J.', pin_cite='268', year=None, court='scotus', antecedent_guess=None)), ShortCaseCitation('479 U. S., at 257', groups={'volume': '479', 'reporter': 'U. S.', 'page': '257'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='257', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 264', groups={'volume': '479', 'reporter': 'U. S.', 'page': '264'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='264', year=None, court='scotus', antecedent_guess=None)), IdCitation('Ibid.', metadata=IdCitation.Metadata(parenthetical=None, pin_cite=None)), IdCitation('Ibid.', metadata=IdCitation.Metadata(parenthetical=None, pin_cite=None)), ShortCaseCitation('479 U. S., at 259', groups={'volume': '479', 'reporter': 'U. S.', 'page': '259'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='259, n. 12', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 258', groups={'volume': '479', 'reporter': 'U. 
S.', 'page': '258'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='258', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 258', groups={'volume': '479', 'reporter': 'U. S.', 'page': '258'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='258', year=None, court='scotus', antecedent_guess='MCFL')), FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite='264', year='1986', court='scotus', plaintiff=None, defendant='MCFL', extra=None)), ShortCaseCitation('479 U. S., at 259', groups={'volume': '479', 'reporter': 'U. S.', 'page': '259'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='259', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 258', groups={'volume': '479', 'reporter': 'U. S.', 'page': '258'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='258', year=None, court='scotus', antecedent_guess='MCFL'))]\n"
+ "[FullCaseCitation('558 U. S. ____', groups={'volume': '558', 'reporter': 'U. S.', 'page': None}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite=None, year='2010', court='scotus', plaintiff=None, defendant=None, extra=None))]\n"
]
}
],
@@ -435,31 +456,31 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "(Slip Opinion) OCTOBER TERM, 2009 1\r\n",
- "\r\n",
- " Syllabus\r\n",
- "\r\n",
- " NOTE: Where it is feasible, a syllabus (headnote) will be released, as is\r\n",
- " being done in connection with this case, at the time the opinion is issued.\r\n",
- " The syllabus constitutes no part of the opinion of the Court but has been\r\n",
- " prepared by the Reporter of Decisions for the convenience of the reader.\r\n",
- " See United States v. Detroit Timber & Lumber Co., 200 U. S. 321, 337.\r\n",
- "\r\n",
- "\r\n",
- "SUPREME COURT OF THE UNITED STATES\r\n",
- "\r\n",
- " Syllabus\r\n",
- "\r\n",
- " CITIZENS UNITED v. FEDERAL ELECTION\r\n",
- "\r\n",
- " COMMISSION \r\n",
- "\r\n",
- "\r\n",
- "APPEAL FROM THE UNITED STATES DISTRICT COURT FOR THE\r\n",
- " DISTRICT OF COLUMBIA\r\n",
- "\r\n",
- "No. 08–205. Argued March 24, 2009—Reargued September 9, 2009––\r\n",
- " Decided January 21, 2010\r\n",
+ "(Slip Opinion) OCTOBER TERM, 2009 1\n",
+ "\n",
+ " Syllabus\n",
+ "\n",
+ " NOTE: Where it is feasible, a syllabus (headnote) will be released, as is\n",
+ " being done in connection with this case, at the time the opinion is issued.\n",
+ " The syllabus constitutes no part of the opinion of the Court but has been\n",
+ " prepared by the Reporter of Decisions for the convenience of the reader.\n",
+ " See United States v. Detroit Timber & Lumber Co., 200 U. S. 321, 337.\n",
+ "\n",
+ "\n",
+ "SUPREME COURT OF THE UNITED STATES\n",
+ "\n",
+ " Syllabus\n",
+ "\n",
+ " CITIZENS UNITED v. FEDERAL ELECTION\n",
+ "\n",
+ " COMMISSION \n",
+ "\n",
+ "\n",
+ "APPEAL FROM THE UNITED STATES DISTRICT COURT FOR THE\n",
+ " DISTRICT OF COLUMBIA\n",
+ "\n",
+ "No. 08–205. Argued March 24, 2009—Reargued September 9, 2009––\n",
+ " Decided January 21, 2010\n",
"As amended by §2\n"
]
}
@@ -475,14 +496,6 @@
"source": [
"Nice!"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3ea3af4c",
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -501,7 +514,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.16"
}
},
"nbformat": 4,
From 27264b8c06492c7dd3e777915583fac6fa21e268 Mon Sep 17 00:00:00 2001
From: rachlllg
Date: Mon, 27 Jan 2025 16:16:26 -0800
Subject: [PATCH 16/40] fix_permissions
---
.github/workflows/benchmark.yml | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index a6db91e5..a0566b64 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -12,6 +12,8 @@ jobs:
name: PR comment
if: github.event_name == 'pull_request'
runs-on: ubuntu-latest
+ permissions:
+ pull-requests: write
steps:
#----------------------------------------------
# check-out repo and set-up python
@@ -159,6 +161,8 @@ jobs:
name: Reporters-DB-Dipatch
if: github.event_name == 'repository_dispatch'
runs-on: ubuntu-latest
+ permissions:
+ pull-requests: write
steps:
#----------------------------------------------
# check-out repo and set-up python
From 2079bece2b6e472c16bb07a896a40e74da2c7914 Mon Sep 17 00:00:00 2001
From: rachlllg
Date: Mon, 27 Jan 2025 16:21:32 -0800
Subject: [PATCH 17/40] clean_up_files
---
.github/workflows/benchmark.yml | 4 -
TUTORIAL.ipynb | 150 ++++++++++++++++----------------
2 files changed, 74 insertions(+), 80 deletions(-)
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index a0566b64..a6db91e5 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -12,8 +12,6 @@ jobs:
name: PR comment
if: github.event_name == 'pull_request'
runs-on: ubuntu-latest
- permissions:
- pull-requests: write
steps:
#----------------------------------------------
# check-out repo and set-up python
@@ -161,8 +159,6 @@ jobs:
name: Reporters-DB-Dipatch
if: github.event_name == 'repository_dispatch'
runs-on: ubuntu-latest
- permissions:
- pull-requests: write
steps:
#----------------------------------------------
# check-out repo and set-up python
diff --git a/TUTORIAL.ipynb b/TUTORIAL.ipynb
index ef358901..03d9aae3 100644
--- a/TUTORIAL.ipynb
+++ b/TUTORIAL.ipynb
@@ -68,31 +68,31 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "(Slip Opinion) OCTOBER TERM, 2009 1\n",
- "\n",
- " Syllabus\n",
- "\n",
- " NOTE: Where it is feasible, a syllabus (headnote) will be released, as is\n",
- " being done in connection with this case, at the time the opinion is issued.\n",
- " The syllabus constitutes no part of the opinion of the Court but has been\n",
- " prepared by the Reporter of Decisions for the convenience of the reader.\n",
- " See United States v. Detroit Timber & Lumber Co., 200 U. S. 321, 337.\n",
- "\n",
- "\n",
- "SUPREME COURT OF THE UNITED STATES\n",
- "\n",
- " Syllabus\n",
- "\n",
- " CITIZENS UNITED v. FEDERAL ELECTION\n",
- "\n",
- " COMMISSION \n",
- "\n",
- "\n",
- "APPEAL FROM THE UNITED STATES DISTRICT COURT FOR THE\n",
- " DISTRICT OF COLUMBIA\n",
- "\n",
- "No. 08–205. Argued March 24, 2009—Reargued September 9, 2009––\n",
- " Decided January 21, 2010\n",
+ "(Slip Opinion) OCTOBER TERM, 2009 1\r\n",
+ "\r\n",
+ " Syllabus\r\n",
+ "\r\n",
+ " NOTE: Where it is feasible, a syllabus (headnote) will be released, as is\r\n",
+ " being done in connection with this case, at the time the opinion is issued.\r\n",
+ " The syllabus constitutes no part of the opinion of the Court but has been\r\n",
+ " prepared by the Reporter of Decisions for the convenience of the reader.\r\n",
+ " See United States v. Detroit Timber & Lumber Co., 200 U. S. 321, 337.\r\n",
+ "\r\n",
+ "\r\n",
+ "SUPREME COURT OF THE UNITED STATES\r\n",
+ "\r\n",
+ " Syllabus\r\n",
+ "\r\n",
+ " CITIZENS UNITED v. FEDERAL ELECTION\r\n",
+ "\r\n",
+ " COMMISSION \r\n",
+ "\r\n",
+ "\r\n",
+ "APPEAL FROM THE UNITED STATES DISTRICT COURT FOR THE\r\n",
+ " DISTRICT OF COLUMBIA\r\n",
+ "\r\n",
+ "No. 08–205. Argued March 24, 2009—Reargued September 9, 2009––\r\n",
+ " Decided January 21, 2010\r\n",
"As amended by §203 of the Bipartisan Campaign Reform Act of\n"
]
}
@@ -165,23 +165,13 @@
},
{
"cell_type": "code",
- "execution_count": 6,
- "id": "9f503e9d-7432-4454-9864-bd38b5237d93",
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: hyperscan in /Users/rachelgao/anaconda3/envs/test/lib/python3.10/site-packages (0.7.8)\n"
- ]
- }
- ],
+ "execution_count": null,
+ "id": "1384d75b",
+ "metadata": {},
+ "outputs": [],
"source": [
- "# pip install hyperscan if not already installed\n",
- "!pip install hyperscan"
+ "# install hyperscan if not already installed\n",
+ "# !pip install hyperscan"
]
},
{
@@ -194,8 +184,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "CPU times: user 962 ms, sys: 20.9 ms, total: 983 ms\n",
- "Wall time: 1.01 s\n"
+ "CPU times: user 14.9 s, sys: 301 ms, total: 15.2 s\n",
+ "Wall time: 15.7 s\n"
]
}
],
@@ -223,8 +213,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "CPU times: user 829 ms, sys: 3.07 ms, total: 832 ms\n",
- "Wall time: 831 ms\n"
+ "CPU times: user 183 ms, sys: 5.74 ms, total: 189 ms\n",
+ "Wall time: 198 ms\n"
]
}
],
@@ -251,7 +241,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Extracted 1113 citations.\n",
+ "Extracted 1005 citations.\n",
"\n",
"First citation:\n",
" FullCaseCitation('200 U. S. 321', groups={'volume': '200', 'reporter': 'U. S.', 'page': '321'}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite='337', year=None, court='scotus', plaintiff='States', defendant='Detroit Timber & Lumber Co.', extra=None))\n"
@@ -307,7 +297,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Resolved citations into 284 groups.\n",
+ "Resolved citations into 176 groups.\n",
"\n"
]
}
@@ -335,12 +325,12 @@
"output_type": "stream",
"text": [
"This case is cited lots of times:\n",
- "FullCaseCitation('558 U. S. ____', groups={'volume': '558', 'reporter': 'U. S.', 'page': None}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite=None, year='2010', court='scotus', plaintiff=None, defendant=None, extra=None))\n",
+ "FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical='MCFL', pin_cite='249', year='1986', court='scotus', plaintiff='Comm’n', defendant='Massachusetts Citizens for Life, Inc.', extra=None))\n",
"\n",
- "1 times, in fact.\n",
+ "23 times, in fact.\n",
"\n",
"Here are all of its citations:\n",
- "[FullCaseCitation('558 U. S. ____', groups={'volume': '558', 'reporter': 'U. S.', 'page': None}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite=None, year='2010', court='scotus', plaintiff=None, defendant=None, extra=None))]\n"
+ "[FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical='MCFL', pin_cite='249', year='1986', court='scotus', plaintiff='Comm’n', defendant='Massachusetts Citizens for Life, Inc.', extra=None)), ShortCaseCitation('479 U. S., at 257', groups={'volume': '479', 'reporter': 'U. S.', 'page': '257'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='257', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 260', groups={'volume': '479', 'reporter': 'U. S.', 'page': '260'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='260', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 262', groups={'volume': '479', 'reporter': 'U. S.', 'page': '262'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='262', year=None, court='scotus', antecedent_guess='MCFL')), FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite=None, year='1986', court='scotus', plaintiff='Comm’n', defendant='Massachusetts Citizens for Life, Inc.', extra=None)), FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical='MCFL', pin_cite=None, year='1986', court='scotus', plaintiff='FEC', defendant='Massachusetts Citizens for Life, Inc.', extra=None)), FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite=None, year=None, court='scotus', plaintiff=None, defendant=None, extra=None)), FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. 
S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite=None, year=None, court='scotus', plaintiff=None, defendant=None, extra=None)), IdCitation('id.,', metadata=IdCitation.Metadata(parenthetical='quoting NRWC, 459 U. S., at 209–210', pin_cite='at 256')), ShortCaseCitation('479 U. S., at 257', groups={'volume': '479', 'reporter': 'U. S.', 'page': '257'}, metadata=ShortCaseCitation.Metadata(parenthetical='internal quotation marks omitted', pin_cite='257', year=None, court='scotus', antecedent_guess=None)), IdCitation('id.,', metadata=IdCitation.Metadata(parenthetical='internal quotation marks omitted', pin_cite='at 260')), IdCitation('id.,', metadata=IdCitation.Metadata(parenthetical=None, pin_cite='at 257')), ShortCaseCitation('479 U. S., at 268', groups={'volume': '479', 'reporter': 'U. S.', 'page': '268'}, metadata=ShortCaseCitation.Metadata(parenthetical='opinion of Rehnquist, C. J.', pin_cite='268', year=None, court='scotus', antecedent_guess=None)), ShortCaseCitation('479 U. S., at 257', groups={'volume': '479', 'reporter': 'U. S.', 'page': '257'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='257', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 264', groups={'volume': '479', 'reporter': 'U. S.', 'page': '264'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='264', year=None, court='scotus', antecedent_guess=None)), IdCitation('Ibid.', metadata=IdCitation.Metadata(parenthetical=None, pin_cite=None)), IdCitation('Ibid.', metadata=IdCitation.Metadata(parenthetical=None, pin_cite=None)), ShortCaseCitation('479 U. S., at 259', groups={'volume': '479', 'reporter': 'U. S.', 'page': '259'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='259, n. 12', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 258', groups={'volume': '479', 'reporter': 'U. 
S.', 'page': '258'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='258', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 258', groups={'volume': '479', 'reporter': 'U. S.', 'page': '258'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='258', year=None, court='scotus', antecedent_guess='MCFL')), FullCaseCitation('479 U. S. 238', groups={'volume': '479', 'reporter': 'U. S.', 'page': '238'}, metadata=FullCaseCitation.Metadata(parenthetical=None, pin_cite='264', year='1986', court='scotus', plaintiff=None, defendant='MCFL', extra=None)), ShortCaseCitation('479 U. S., at 259', groups={'volume': '479', 'reporter': 'U. S.', 'page': '259'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='259', year=None, court='scotus', antecedent_guess='MCFL')), ShortCaseCitation('479 U. S., at 258', groups={'volume': '479', 'reporter': 'U. S.', 'page': '258'}, metadata=ShortCaseCitation.Metadata(parenthetical=None, pin_cite='258', year=None, court='scotus', antecedent_guess='MCFL'))]\n"
]
}
],
@@ -456,31 +446,31 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "(Slip Opinion) OCTOBER TERM, 2009 1\n",
- "\n",
- " Syllabus\n",
- "\n",
- " NOTE: Where it is feasible, a syllabus (headnote) will be released, as is\n",
- " being done in connection with this case, at the time the opinion is issued.\n",
- " The syllabus constitutes no part of the opinion of the Court but has been\n",
- " prepared by the Reporter of Decisions for the convenience of the reader.\n",
- " See United States v. Detroit Timber & Lumber Co., 200 U. S. 321, 337.\n",
- "\n",
- "\n",
- "SUPREME COURT OF THE UNITED STATES\n",
- "\n",
- " Syllabus\n",
- "\n",
- " CITIZENS UNITED v. FEDERAL ELECTION\n",
- "\n",
- " COMMISSION \n",
- "\n",
- "\n",
- "APPEAL FROM THE UNITED STATES DISTRICT COURT FOR THE\n",
- " DISTRICT OF COLUMBIA\n",
- "\n",
- "No. 08–205. Argued March 24, 2009—Reargued September 9, 2009––\n",
- " Decided January 21, 2010\n",
+ "(Slip Opinion) OCTOBER TERM, 2009 1\r\n",
+ "\r\n",
+ " Syllabus\r\n",
+ "\r\n",
+ " NOTE: Where it is feasible, a syllabus (headnote) will be released, as is\r\n",
+ " being done in connection with this case, at the time the opinion is issued.\r\n",
+ " The syllabus constitutes no part of the opinion of the Court but has been\r\n",
+ " prepared by the Reporter of Decisions for the convenience of the reader.\r\n",
+ " See United States v. Detroit Timber & Lumber Co., 200 U. S. 321, 337.\r\n",
+ "\r\n",
+ "\r\n",
+ "SUPREME COURT OF THE UNITED STATES\r\n",
+ "\r\n",
+ " Syllabus\r\n",
+ "\r\n",
+ " CITIZENS UNITED v. FEDERAL ELECTION\r\n",
+ "\r\n",
+ " COMMISSION \r\n",
+ "\r\n",
+ "\r\n",
+ "APPEAL FROM THE UNITED STATES DISTRICT COURT FOR THE\r\n",
+ " DISTRICT OF COLUMBIA\r\n",
+ "\r\n",
+ "No. 08–205. Argued March 24, 2009—Reargued September 9, 2009––\r\n",
+ " Decided January 21, 2010\r\n",
"As amended by §2\n"
]
}
@@ -496,6 +486,14 @@
"source": [
"Nice!"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ea3af4c",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
@@ -514,7 +512,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.16"
+ "version": "3.8.5"
}
},
"nbformat": 4,
From ad00622ab74945decb12c971323f53d834092e4b Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Tue, 28 Jan 2025 10:31:28 -0500
Subject: [PATCH 18/40] version bump v2.6.5
---
CHANGES.md | 15 +++++++++++++--
pyproject.toml | 2 +-
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
index 1964edf8..2fab0145 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -16,14 +16,25 @@ Fixes:
## Current
-**2.6.4 - 2024-06-03**
+**2.6.5 - 2025-01-28**
+
+Features:
+
+- Add ReferenceCitation model and associated logic
Fixes:
-- Bump eyecite to for InvalidError/hyperscan bug
+- Fix court string matching with whitespace
+- Fix court name issues
## Past
+**2.6.4 - 2024-06-03**
+
+Fixes:
+
+- Bump eyecite to for InvalidError/hyperscan bug
+
**2.6.3 - 2024-04-09**
Fixes:
diff --git a/pyproject.toml b/pyproject.toml
index 82167dc1..fbbc42d0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
[tool.poetry]
-version = "2.6.4"
+version = "2.6.5"
authors = ["Free Law Project "]
classifiers = [
"Development Status :: 5 - Production/Stable",
From 4bfcc46f5abfad13fc5e54df5444b9b4b34c1229 Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Wed, 29 Jan 2025 20:12:56 -0500
Subject: [PATCH 19/40] fix(annotate_citations): try to include HTML style tags
if not balanced
Some annotations, especially for ReferenceCitations, are discarded in HTML sources because some style tags (mostly i or em) are not balanced. This PR tries to include the style tags in the citation span.
- Adds tests for `utils.maybe_balance_style_tags`
- Add a logger.error call when the unbalanced HTML could not be fixed
Solves #196
---
eyecite/annotate.py | 31 +++++++++---
eyecite/utils.py | 54 ++++++++++++++++++++
tests/test_AnnotateTest.py | 100 ++++++++++++++++++++++++++++++++++++-
3 files changed, 176 insertions(+), 9 deletions(-)
diff --git a/eyecite/annotate.py b/eyecite/annotate.py
index 7f5ba46e..d3558b6c 100644
--- a/eyecite/annotate.py
+++ b/eyecite/annotate.py
@@ -1,11 +1,18 @@
from bisect import bisect_left, bisect_right
from difflib import SequenceMatcher
from functools import partial
+from logging import getLogger
from typing import Any, Callable, Iterable, Optional, Tuple
import fast_diff_match_patch
-from eyecite.utils import is_balanced_html, wrap_html_tags
+from eyecite.utils import (
+ is_balanced_html,
+ maybe_balance_style_tags,
+ wrap_html_tags,
+)
+
+logger = getLogger("eyecite")
def annotate_citations(
@@ -59,6 +66,9 @@ def annotate_citations(
Returns:
The annotated text.
"""
+ if unbalanced_tags not in ["unchecked", "skip", "wrap"]:
+ raise ValueError(f"Unknown option '{unbalanced_tags}")
+
# set up offset_updater if we have to move annotations to source_text
offset_updater = None
if source_text and source_text != plain_text:
@@ -88,13 +98,20 @@ def annotate_citations(
# handle HTML tags
if unbalanced_tags == "unchecked":
pass
- elif unbalanced_tags in ("skip", "wrap"):
- if not is_balanced_html(span_text):
- if unbalanced_tags == "skip":
- continue
+ elif not is_balanced_html(span_text):
+ if unbalanced_tags == "wrap":
span_text = wrap_html_tags(span_text, after, before)
- else:
- raise ValueError(f"Unknown option '{unbalanced_tags}")
+ else: # "skip" case
+ original_span_text = span_text
+ start, end, span_text = maybe_balance_style_tags(
+ start, end, plain_text
+ )
+ if not is_balanced_html(span_text):
+ logger.error(
+ "Citation was not annotated due to unbalanced tags %s",
+ original_span_text,
+ )
+ continue
if annotator is not None:
annotated_span = annotator(before, span_text, after)
diff --git a/eyecite/utils.py b/eyecite/utils.py
index b2bb66a1..c606a32e 100644
--- a/eyecite/utils.py
+++ b/eyecite/utils.py
@@ -130,3 +130,57 @@ def hash_sha256(dictionary: dict) -> int:
# Calculate the hash of the bytes, convert to an int, and return
return int.from_bytes(hashlib.sha256(json_bytes).digest(), byteorder="big")
+
+
+def maybe_balance_style_tags(
+    start: int, end: int, plain_text: str
+) -> tuple[int, int, str]:
+    """Try to include style tags at the edge of the span marked as invalid
+
+    In some HTML sources the citations are styled with tags like <i> or <em>.
+    When the citation is found in a stripped-of-tags text, the span may
+    leave out the opening or closing tag. When this happens and we try to
+    annotate the HTML, it will render invalid HTML. This happens mostly with
+    IdCitation, ReferenceCitation, etc.
+
+    This function will try to find opening or closing tags immediately
+    preceding or following the citation span. If it finds them, it will
+    return the new start, end and span. If not, it will return the old ones
+
+    :param start: the original start of the span
+    :param end: the original end of the span
+    :param plain_text: the text to annotate
+    :return: a tuple (new start, new end, new span text)
+    """
+    span_text = plain_text[start:end]
+    style_tags = ["i", "em", "b"]
+    tolerance = 5  # tolerate at most this amount of whitespace
+
+    for tag in style_tags:
+        opening_tag = f"<{tag}>"
+        closing_tag = f"</{tag}>"
+        has_opening = opening_tag in span_text
+        has_closing = closing_tag in span_text
+        if has_opening and not has_closing:
+            # look for closing tag after the end; clamp to the text length
+            extended_end = min(
+                end + len(closing_tag) + tolerance, len(plain_text)
+            )
+            if end_match := re.search(
+                rf"{re.escape(span_text)}\s*{closing_tag}",
+                plain_text[start:extended_end],
+                flags=re.MULTILINE,
+            ):
+                end = start + end_match.end()
+
+        if not has_opening and has_closing:
+            # look for opening tag before the start; never go below index 0
+            extended_start = max(start - len(opening_tag) - tolerance, 0)
+            if start_match := re.search(
+                rf"{opening_tag}\s*{re.escape(span_text)}",
+                plain_text[extended_start:end],
+                flags=re.MULTILINE,
+            ):
+                start = extended_start + start_match.start()
+
+    return start, end, plain_text[start:end]
diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py
index e61c7af3..7572b828 100644
--- a/tests/test_AnnotateTest.py
+++ b/tests/test_AnnotateTest.py
@@ -12,6 +12,7 @@ def straighten_quotes(text):
def lower_annotator(before, text, after):
return before + text.lower() + after
+ self.maxDiff = None
test_pairs = (
# single cite
("1 U.S. 1", "<0>1 U.S. 10>", []),
@@ -59,10 +60,10 @@ def lower_annotator(before, text, after):
"foo <0>1 U.S. 10> bar",
["html", "inline_whitespace"],
),
- # whitespace and html -- skip unbalanced tags
+ # whitespace and html -- unbalanced tags are repaired
(
"foo 1 U.S. 1; 2 U.S. 2",
- "foo 1 U.S. 1; <1>2 U.S. 21>",
+ "foo <0>1 U.S. 10>; <1>2 U.S. 21>",
["html", "inline_whitespace"],
{"unbalanced_tags": "skip"},
),
@@ -101,6 +102,94 @@ def lower_annotator(before, text, after):
[],
{"annotator": lower_annotator},
),
+ # solvable unbalanced tag. Need the FullCaseCitation first
+ # so the ReferenceCitation can be found
+ # from https://www.courtlistener.com/api/rest/v4/opinions/8496639/
+ # source: Opinion.xml_harvard
+ (
+ " partially secured by a debtor’s principal residence was not "
+ "con-firmable. Nobelman v. Am. Sav. Bank, "
+ "508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That "
+ "plan proposed to bifurcate the claim and... pay the unsecured"
+ "... only by a lien on the debtor’s principal residence.” "
+ "Nobelman at 332, 113 S.Ct. 2106. Section 1123(b)(5) "
+ "codifies the Nobelman decision in individual debtor "
+ "chapter 11 cases.",
+ " partially secured by a debtor’s principal residence was not"
+ " con-firmable. Nobelman v. Am. Sav. Bank, "
+ "508 U.S. 324, "
+ "113 S.Ct. 2106, 124 L.Ed.2d 228"
+ " (1993). That plan proposed to bifurcate the claim and..."
+ " pay the unsecured... only by a lien on the debtor’s"
+ " principal residence.” Nobelman "
+ "at 332, 113 S.Ct. 2106. Section"
+ " 1123(b)(5) codifies the Nobelman decision in"
+ " individual debtor chapter 11 cases.",
+ ["html", "all_whitespace"],
+ {"annotate_anchors": True, "unbalanced_tags": "skip"},
+ ),
+ # solvable unbalanced tag
+ # from https://www.courtlistener.com/api/rest/v4/opinions/2841253/
+ # source: Opinion.html
+ (
+ "he has not agreed so to submit.’” Howsam v. Dean"
+ " Witter Reynolds, Inc., 537 U.S. 79, 83, 123 S. Ct."
+ " 588, 591 (2002) (combined mandamus and"
+ " interlocutory appeal) (citing Howsam at 84, 123"
+ " S. Ct. at 592)",
+ "he has not agreed so to submit.’” Howsam v. Dean"
+ " Witter Reynolds, Inc., 537 U.S."
+ " 79, 83, 123 S. Ct. 588, 591"
+ " (2002) (combined mandamus and interlocutory appeal)"
+ " (citing Howsam at 84, 123 S. Ct. at 592)",
+ ["html", "all_whitespace"],
+ {"annotate_anchors": True, "unbalanced_tags": "skip"},
+ ),
+ # The next 2 examples could be resolved if we increased the
+ # character tolerance or admitted the full case name instead of
+ # just one of the parties
+ (
+ # https://www.courtlistener.com/api/rest/v4/opinions/1535649/
+ # source: xml_harvard
+ "See also Styler v. Tall Oaks, Inc. (In re Hatch),"
+ " 93 B.R. 263, 267 (Bankr.D. Utah 1988),"
+ " rev'd 114 B.R. 747 (D.Utah 1989)."
+ "
... The court makes no"
+ " determination as to whe Fifth Amendment to the"
+ " constitution of the United States.” Styler v."
+ " Tall Oaks, Inc. (In re Hatch), at 748."
+ "",
+ "See also Styler v. Tall Oaks, Inc. (In re Hatch),"
+ " 93 B.R. 263, 267"
+ " (Bankr.D. Utah 1988), rev'd 114 B.R. 747 (D.Utah 1989)."
+ "... The court makes no"
+ " determination as to whe Fifth Amendment to the"
+ " constitution of the United States.” Styler v."
+ " Tall Oaks, Inc. (In re Hatch), at 748."
+ "",
+ ["html", "all_whitespace"],
+ {"annotate_anchors": True, "unbalanced_tags": "skip"},
+ ),
+ (
+ # https://www.courtlistener.com/api/rest/v4/opinions/1985850/
+ # source: html_lawbox
+ "to act rationally. See, e.g., State v."
+ " Wingler, 25 N.J. 161, 175, 135 A.2d"
+ " 468 (1957); citing, ... have been applied.'"
+ " [State v. Wingler at 175, 135 A.2d"
+ " 468, citing, Minnesota ex rel.",
+ "to act rationally. See, e.g., State v."
+ " Wingler, 25 N.J."
+ " 161, 175, 135 A.2d"
+ " 468 (1957); citing, ... have been applied.'"
+ " [State v. Wingler at 175, 135 A.2d 468, citing,"
+ " Minnesota ex rel.",
+ ["html", "all_whitespace"],
+ {"annotate_anchors": True, "unbalanced_tags": "skip"},
+ ),
)
for source_text, expected, clean_steps, *annotate_kwargs in test_pairs:
annotate_kwargs = annotate_kwargs[0] if annotate_kwargs else {}
@@ -115,6 +204,13 @@ def lower_annotator(before, text, after):
(c.span(), f"<{i}>", f"{i}>")
for i, c in enumerate(cites)
]
+
+ if annotate_kwargs.pop("annotate_anchors", False):
+ annotations = [
+ (c.span(), "", "")
+ for c in cites
+ ]
+
annotated = annotate_citations(
plain_text,
annotations,
From e0e79f156e6b10076ab3af6b480f59377d3196d5 Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Fri, 31 Jan 2025 19:31:11 -0500
Subject: [PATCH 20/40] feat(ReferenceCitation): use resolved_case_name and
resolved_case_name_short for search and resolution
Fixes #199
- updated ReferenceCitation and FullCaseCitation models metadata to admit resolved_case_name_short and resolved_case_name
- update helpers.filter_citations to save metadata of duplicated ReferenceCitations on the kept object
- update tests to show how the finding and resolution will work
---
eyecite/find.py | 9 +++--
eyecite/helpers.py | 14 +++++++
eyecite/models.py | 12 ++++++
eyecite/resolve.py | 42 ++++++++++----------
tests/test_FindTest.py | 29 ++++++++++++++
tests/test_ResolveTest.py | 83 +++++++++++++++++++++++++++------------
6 files changed, 138 insertions(+), 51 deletions(-)
diff --git a/eyecite/find.py b/eyecite/find.py
index c3cc4c3f..90985bed 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -79,7 +79,7 @@ def get_citations(
# Check for reference citations that follow a full citation
# Using the plaintiff or defendant
- references = _extract_reference_citations(citation, plain_text)
+ references = extract_reference_citations(citation, plain_text)
citations.extend(references)
# CASE 2: Token is an "Id." or "Ibid." reference.
@@ -124,8 +124,9 @@ def get_citations(
return citations
-def _extract_reference_citations(
- citation: FullCitation, plain_text: str
+def extract_reference_citations(
+ citation: FullCitation,
+ plain_text: str,
) -> List[ReferenceCitation]:
"""Extract reference citations that follow a full citation
@@ -156,7 +157,7 @@ def is_valid_name(name: str) -> bool:
regexes = [
rf"(?P<{key}>{re.escape(value)})"
- for key in ["plaintiff", "defendant"]
+ for key in ReferenceCitation.name_fields
if (value := getattr(citation.metadata, key, None))
and is_valid_name(value)
]
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index 9ab88f53..82e63b59 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -12,6 +12,7 @@
FullJournalCitation,
FullLawCitation,
ParagraphToken,
+ ReferenceCitation,
ResourceCitation,
StopWordToken,
Token,
@@ -336,6 +337,19 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
last_span = last_citation.span()
current_span = citation.span()
+ if current_span == last_span and isinstance(
+ last_citation, ReferenceCitation
+ ):
+ # a single ReferenceCitation may be found via different
+ # names. Save the name metadata to account for collisions
+ for field in ReferenceCitation.name_fields:
+ if not getattr(last_citation.metadata, field):
+ setattr(
+ last_citation.metadata,
+ field,
+ getattr(citation.metadata, field),
+ )
+
if current_span[0] <= last_span[1]:
# Remove overlapping citations that can occur in edge cases
continue
diff --git a/eyecite/models.py b/eyecite/models.py
index c0fc5e25..cf3db602 100644
--- a/eyecite/models.py
+++ b/eyecite/models.py
@@ -456,6 +456,9 @@ class Metadata(CaseCitation.Metadata):
plaintiff: Optional[str] = None
defendant: Optional[str] = None
extra: Optional[str] = None
+ # May be populated after citation resolution
+ resolved_case_name_short: Optional[str] = None
+ resolved_case_name: Optional[str] = None
def add_metadata(self, words: "Tokens"):
"""Extract metadata from text before and after citation."""
@@ -604,6 +607,15 @@ class Metadata(CitationBase.Metadata):
plaintiff: Optional[str] = None
defendant: Optional[str] = None
pin_cite: Optional[str] = None
+ resolved_case_name_short: Optional[str] = None
+ resolved_case_name: Optional[str] = None
+
+ name_fields = [
+ "plaintiff",
+ "defendant",
+ "resolved_case_name_short",
+ "resolved_case_name",
+ ]
@dataclass(eq=False, unsafe_hash=False, repr=False)
diff --git a/eyecite/resolve.py b/eyecite/resolve.py
index 7f09ccfb..6defca58 100644
--- a/eyecite/resolve.py
+++ b/eyecite/resolve.py
@@ -84,29 +84,28 @@ def _filter_by_matching_antecedent(
return matches[0] if len(matches) == 1 else None
-def _filter_by_matching_plaintiff_or_defendant(
+def _filter_by_matching_plaintiff_or_defendant_or_resolved_names(
resolved_full_cites: ResolvedFullCites,
- plaintiff: str,
- defendant: str,
+ reference_citation: ReferenceCitation,
) -> Optional[ResourceType]:
- """Filter out any impossible reference citations"""
+ """Filter out reference citations that point to more than 1 Resource"""
matches: List[ResourceType] = []
for full_citation, resource in resolved_full_cites:
if not isinstance(full_citation, FullCaseCitation):
continue
- defendant_match = (
- defendant
- and full_citation.metadata.defendant
- and defendant in full_citation.metadata.defendant
- )
- plaintiff_match = (
- plaintiff
- and full_citation.metadata.plaintiff
- and plaintiff in full_citation.metadata.plaintiff
- )
- if defendant_match or plaintiff_match:
- matches.append(resource)
+
+ for key in ReferenceCitation.name_fields:
+ reference_value = getattr(reference_citation.metadata, key)
+ full_case_value = getattr(full_citation.metadata, key)
+ if (
+ reference_value
+ and full_case_value
+ and reference_value in full_case_value
+ ):
+ matches.append(resource)
+ break
+
# Remove duplicates and only accept if one candidate remains
matches = list(set(matches))
return matches[0] if len(matches) == 1 else None
@@ -216,18 +215,19 @@ def _resolve_reference_citation(
"""Resolve reference citations
Try to resolve reference citations by checking whether their is only one
- full citation that appears with either the defendant or plaintiff
+ full citation that appears with either the defendant or plaintiff or
+ resolved_case_name_short or resolved_case_name
field of any of the previously resolved full citations.
"""
if (
not reference_citation.metadata.defendant
and not reference_citation.metadata.plaintiff
+ and not reference_citation.metadata.resolved_case_name_short
+ and not reference_citation.metadata.resolved_case_name
):
return None
- return _filter_by_matching_plaintiff_or_defendant(
- resolved_full_cites,
- reference_citation.metadata.plaintiff,
- reference_citation.metadata.defendant,
+ return _filter_by_matching_plaintiff_or_defendant_or_resolved_names(
+ resolved_full_cites, reference_citation
)
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 8f2da29a..291c9ae1 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -4,6 +4,8 @@
from unittest import TestCase
from eyecite import clean_text, get_citations
+from eyecite.find import extract_reference_citations
+from eyecite.helpers import filter_citations
# by default tests use a cache for speed
# call tests with `EYECITE_CACHE_DIR= python ...` to disable cache
@@ -858,3 +860,30 @@ def test_citation_fullspan(self):
self.assertEqual(
extracted.full_span(), (start_idx, len(sentence)), error_msg
)
+
+ def test_reference_extraction(self):
+ """Can we extract a reference citation using resolved metadata?"""
+ texts = [
+ # Here the reference citation found via resolved_case_name is
+ # redundant: the regular extraction process already found it.
+ # Can we deduplicate?
+ """See, e.g., State v. Wingler, 135 A. 2d 468 (1957);
+ [State v. Wingler at 175, citing, Minnesota ex rel.]""",
+ # Here the resolved_case_name is what actually allows finding
+ # the reference citation
+ """See, e.g., State v. W1ngler, 135 A. 2d 468 (1957);
+ [State v. Wingler at 175, citing, Minnesota ex rel.]""",
+ ]
+ for plain_text in texts:
+ citations = get_citations(plain_text)
+ citations[0].metadata.resolved_case_name = "State v. Wingler"
+ references = extract_reference_citations(citations[0], plain_text)
+ final_citations = filter_citations(citations + references)
+ self.assertEqual(
+ len(final_citations), 2, "There should only be 2 citations"
+ )
+ self.assertEqual(
+ len(references),
+ 1,
+ "Only a reference citation should had been picked up",
+ )
diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py
index 6f644123..6c4539c4 100644
--- a/tests/test_ResolveTest.py
+++ b/tests/test_ResolveTest.py
@@ -3,6 +3,8 @@
from unittest import TestCase
from eyecite import get_citations
+from eyecite.find import extract_reference_citations
+from eyecite.helpers import filter_citations
from eyecite.models import FullCitation, Resource
from eyecite.resolve import resolve_citations
@@ -32,42 +34,55 @@ def assertResolution(self, citations, expected_resolution_dict):
)
def checkReferenceResolution(
- self, *expected_resolutions: tuple[list[list[int]], str]
+ self,
+ expected_indices: list[list[int]],
+ citation_text: str,
+ resolved_case_name_short: Optional[str] = None,
):
"""
Helper function to help test reference citations.
Args:
- *expected_resolutions (tuple[list[int], str]):
- A list of tuples where each tuple contains:
- - A list of expected indices for the resolved citations.
- - A string of citation text to process.
-
+ expected_indices: A list of expected indices for the resolved
+ citations.
+ citation_text: A string of citation text to process.
+ resolved_case_name_short: a case name for simulating post-resolution
+ metadata assignment to full case citations; this will also be
+ used as a flag to use a second round of reference extractions
Returns:
None
"""
- for expected_indices, citation_text in expected_resolutions:
- citations = get_citations(citation_text)
+ citations = get_citations(citation_text)
+ if resolved_case_name_short:
+ citations[0].metadata.resolved_case_name_short = (
+ resolved_case_name_short
+ )
+ citations.extend(
+ extract_reference_citations(
+ citations[0], citation_text # type: ignore[arg-type]
+ )
+ )
+ citations = filter_citations(citations)
- # Step 2: Build a helper dict to map corrected citations to indices
- resolution_index_map = {
- cite.corrected_citation(): idx
- for idx, cite in enumerate(citations)
- }
+ # Step 2: Build a helper dict to map corrected citations to indices
+ resolution_index_map = {
+ cite.corrected_citation(): idx
+ for idx, cite in enumerate(citations)
+ }
- # Step 3: Resolve citations and format the resolution
- resolved_citations = resolve_citations(citations)
- formatted_resolution = format_resolution(resolved_citations)
+ # Step 3: Resolve citations and format the resolution
+ resolved_citations = resolve_citations(citations)
+ formatted_resolution = format_resolution(resolved_citations)
- # Step 4: Map resolved citations to their indices
- result = {
- key: [resolution_index_map[value] for value in values]
- for key, values in formatted_resolution.items()
- }
+ # Step 4: Map resolved citations to their indices
+ result = {
+ key: [resolution_index_map[value] for value in values]
+ for key, values in formatted_resolution.items()
+ }
- # Step 5: Compare the actual results with expected indices
- actual_indices = list(result.values())
- self.assertEqual(expected_indices, actual_indices)
+ # Step 5: Compare the actual results with expected indices
+ actual_indices = list(result.values())
+ self.assertEqual(expected_indices, actual_indices)
def checkResolution(
self, *expected_resolutions: Tuple[Optional[int], str]
@@ -337,7 +352,7 @@ def test_complex_resolution(self):
)
def test_reference_resolution(self):
- self.checkReferenceResolution(
+ for test_tuple in (
([[0, 1]], "Foo v. Bar, 1 U.S. 1 ... Foo at 2"),
([[0]], "Foo at 2. .... ; Foo v. Bar, 1 U.S. 1"),
(
@@ -352,4 +367,20 @@ def test_reference_resolution(self):
[[0, 2], [1]],
"Foo v. Bar 1 U.S. 12, 347-348; In Smith, 12 U.S. 1 (1999) we saw something else. someting. In Foo at 2 we see that",
),
- )
+ # Ok resolved_case_name and order, ReferenceCitation should be resolved
+ (
+ [[0, 1], [2]],
+ "State v. Dze 3 U.S. 22; something something. In Doe at 122, something more. In State v. Doe 4 U.S. 33",
+ "Doe",
+ ),
+ # due to the reference matching more than 1 full citation, we don't
+ # resolve
+ (
+ [[0], [1]],
+ "State v. Smlth 3 U.S. 22; something something. In State v. Smith 4 U.S. 33. In Smith at 122, something more",
+ "Smith",
+ ),
+ # ambiguous resolved_case_name, ReferenceCitation should not be
+ # resolved
+ ):
+ self.checkReferenceResolution(*test_tuple)
From a5d398304f6155689f59855c267a62473495fbde Mon Sep 17 00:00:00 2001
From: William Palin
Date: Wed, 5 Feb 2025 14:02:44 -0500
Subject: [PATCH 21/40] feat(references): Add unallowed reference names
---
eyecite/find.py | 2 +
eyecite/utils.py | 103 +++++++++++++++++++++++++++++++++++++++++
tests/test_FindTest.py | 6 +++
3 files changed, 111 insertions(+)
diff --git a/eyecite/find.py b/eyecite/find.py
index 90985bed..50375569 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -29,6 +29,7 @@
)
from eyecite.regexes import SHORT_CITE_ANTECEDENT_REGEX, SUPRA_ANTECEDENT_REGEX
from eyecite.tokenizers import Tokenizer, default_tokenizer
+from eyecite.utils import DISALLOWED_NAMES
def get_citations(
@@ -153,6 +154,7 @@ def is_valid_name(name: str) -> bool:
and name[0].isupper()
and not name.endswith(".")
and not name.isdigit()
+ and name.lower() not in DISALLOWED_NAMES
)
regexes = [
diff --git a/eyecite/utils.py b/eyecite/utils.py
index c606a32e..4c490ab2 100644
--- a/eyecite/utils.py
+++ b/eyecite/utils.py
@@ -4,6 +4,107 @@
from lxml import etree
+# Names not allowed to be reference citations
+# this is partially taken from juriscraper
+DISALLOWED_NAMES = [
+ # Common options
+ 'state',
+ "united states",
+ "people",
+ "commonwealth",
+ "mass",
+ "commissioner",
+ # AGs (surnames) - kept lowercase: is_valid_name() tests name.lower()
+ "akerman",
+ "ashcroft",
+ "barr",
+ "bates",
+ "bell",
+ "berrien",
+ "biddle",
+ "black",
+ "bonaparte",
+ "bork",
+ "bondi",
+ "bradford",
+ "breckinridge",
+ "brewster",
+ "brownell",
+ "butler",
+ "civiletti",
+ "clark",
+ "clement",
+ "clifford",
+ "crittenden",
+ "cummings",
+ "cushing",
+ "daugherty",
+ "devens",
+ "evarts",
+ "filip",
+ "garland",
+ "gerson",
+ "gilpin",
+ "gonzales",
+ "gregory",
+ "griggs",
+ "grundy",
+ "harmon",
+ "hoar",
+ "holder",
+ "jackson",
+ "johnson",
+ "katzenbach",
+ "keisler",
+ "kennedy",
+ "kleindienst",
+ "knox",
+ "lee",
+ "legaré",
+ "levi",
+ "lincoln",
+ "lynch",
+ "macveagh",
+ "mason",
+ "mcgranery",
+ "mcgrath",
+ "mckenna",
+ "mcreynolds",
+ "meese",
+ "miller",
+ "mitchell",
+ "moody",
+ "mukasey",
+ "murphy",
+ "nelson",
+ "olney",
+ "palmer",
+ "pierrepont",
+ "pinkney",
+ "randolph",
+ "reno",
+ "richardson",
+ "rodney",
+ "rogers",
+ "rush",
+ "sargent",
+ "saxbe",
+ "sessions",
+ "smith",
+ "speed",
+ "stanbery",
+ "stanton",
+ "stone",
+ "taft",
+ "taney",
+ "thornburgh",
+ "toucey",
+ "whitacker",
+ "wickersham",
+ "williams",
+ "wirt",
+]
+
def strip_punct(text: str) -> str:
"""Strips punctuation from a given string
@@ -184,3 +285,5 @@ def maybe_balance_style_tags(
start = extended_start + start_match.start()
return start, end, plain_text[start:end]
+
+
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 291c9ae1..00a334a9 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -502,6 +502,12 @@ def test_find_citations(self):
'defendant': 'Bar',
'pin_cite': '347-348'}),
reference_citation('Foo at 62', metadata={'plaintiff': 'Foo', 'pin_cite': '62'})]),
+ ('Foo v. United States 1 U.S. 12, 347-348. something something ... the United States at 1776 we see that and Foo at 62',
+ [case_citation(page='12',
+ metadata={'plaintiff': 'Foo',
+ 'defendant': 'United States',
+ 'pin_cite': '347-348'}),
+ reference_citation('Foo at 62', metadata={'plaintiff': 'Foo', 'pin_cite': '62'})]),
# Test that reference citation must occur after full case citation
('In Foo at 62 we see that, Foo v. Bar 1 U.S. 12, 347-348. something something,',
[case_citation(page='12',
From 25c08d0a004cc026338a9b97f7968bff8d8a3555 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Wed, 5 Feb 2025 14:05:16 -0500
Subject: [PATCH 22/40] fix(references): Lint
---
eyecite/utils.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/eyecite/utils.py b/eyecite/utils.py
index 4c490ab2..0b920d3d 100644
--- a/eyecite/utils.py
+++ b/eyecite/utils.py
@@ -8,7 +8,7 @@
# this is partially taken from juriscraper
DISALLOWED_NAMES = [
# Common options
- 'state',
+ "state",
"united states",
"people",
"commonwealth",
@@ -285,5 +285,3 @@ def maybe_balance_style_tags(
start = extended_start + start_match.start()
return start, end, plain_text[start:end]
-
-
From 40619c71c7c25704a29e6f132d8d21e73928cc0f Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 10:57:23 -0500
Subject: [PATCH 23/40] docs(readme): Update readme
Add our two step reference citation
resolution to the readme.rst
---
README.rst | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/README.rst b/README.rst
index d6fbf845..7e2e7f73 100644
--- a/README.rst
+++ b/README.rst
@@ -119,6 +119,45 @@ Extracting Citations
that might refer to more than one reporter and can't be narrowed down by date.
3. :code:`tokenizer` ==> Tokenizer, default :code:`eyecite.tokenizers.default_tokenizer`: An instance of a Tokenizer object (see "Tokenizers" below).
+Resolving Reference Citations
+-----------------------------
+
+Eyecite now supports a two-step process for extracting and resolving reference citations. This feature improves handling of citations that reference previously mentioned cases without explicitly repeating the full case name or citation.
+
+Reference citations, such as “Theatre Enterprises at 552”, can be difficult to extract accurately: when a judge is citing to `Theatre Enterprises, Inc. v. Paramount Film Distributing Corp., 346 U. S. 537, 541 (1954)`, the reference lacks a full case name. To address this, Eyecite allows for an initial citation extraction, followed by a secondary reference resolution step. If you have an external database (e.g., CourtListener) that provides resolved case names, you can use this feature to enhance citation finding. For example, with the following setup::
+
+    from eyecite import get_citations
+    from eyecite.find import extract_reference_citations
+    from eyecite.helpers import filter_citations
+
+    plain_text = (
+        "quoting Theatre Enterprises, Inc. v. Paramount Film Distributing Corp., 346 U. S. 537, 541 (1954); "
+        "alterations in original). Thus, the District Court understood that allegations of "
+        "parallel business conduct, taken alone, do not state a claim under § 1; "
+        "plaintiffs must allege additional facts that “ten[d] to exclude independent "
+        "self-interested conduct as an As Theatre Enterprises at 552 held, parallel"
+    )
+
+::
+
+ from eyecite import get_citations
+ from eyecite.find import extract_reference_citations
+ from eyecite.helpers import filter_citations
+
+ # Step 1: Extract full citations
+ citations = get_citations(plain_text)
+
+ # Step 2: Resolve the case name from an external database or prior knowledge
+ citations[0].metadata.resolved_case_name_short = "Theatre Enterprises"
+
+ # Step 3: Extract reference citations using the resolved name
+ references = extract_reference_citations(citations[0], plain_text)
+
+ # Step 4: Filter and merge citations
+ new_citations = filter_citations(citations + references)
+
+Keep in mind that this feature requires an external database or heuristic method to resolve the short case name before extracting reference citations a second time.
+
Cleaning Input Text
-------------------
From 2b4fc21f6b2436766f5dc093bdd0ba8612a864a7 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 15:29:57 -0500
Subject: [PATCH 24/40] fix(find.py): Correct span calculation for short-form
citations Short-form citations were incorrectly identifying the span and full
span of a citation.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
For example:
And Twombly, 550 U. S., at 555 …
Currently, when an antecedent guess is identified, it is not factored into the full span calculation. Additionally, the pin-cite is not correctly incorporated into the offset. This fix ensures both are properly accounted for.
---
eyecite/find.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/eyecite/find.py b/eyecite/find.py
index 90985bed..5e50d7be 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -246,7 +246,10 @@ def _extract_shortform_citation(
strings_only=True,
forward=False,
)
+ offset = 0
if m:
+ ante_start, ante_end = m.span()
+ offset = ante_end - ante_start
antecedent_guess = m["antecedent"].strip()
# Get pin_cite
@@ -262,6 +265,8 @@ def _extract_shortform_citation(
exact_editions=cite_token.exact_editions,
variation_editions=cite_token.variation_editions,
span_end=span_end,
+ full_span_start=cite_token.start - offset,
+ full_span_end=max([span_end, cite_token.end]),
metadata={
"antecedent_guess": antecedent_guess,
"pin_cite": pin_cite,
From e0cff410e0ab6a2d449540729f6f954af5513fce Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 15:34:58 -0500
Subject: [PATCH 25/40] fix(lint): Fix lint issue
---
eyecite/find.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/eyecite/find.py b/eyecite/find.py
index 5e50d7be..6efeda97 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -257,6 +257,7 @@ def _extract_shortform_citation(
pin_cite, span_end, parenthetical = extract_pin_cite(
words, index, prefix=cite_token.groups["page"]
)
+ span_end = span_end if span_end else 0
# make ShortCaseCitation
citation = ShortCaseCitation(
From 7c14e8b90c85924ecd362b864af4c3afc963af19 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 16:16:28 -0500
Subject: [PATCH 26/40] feat(helpers): Update eyecite filtering
For our expanded reference citations
we need to check whether citations overlap,
to avoid scenarios where we failed to parse
things correctly.
---
eyecite/helpers.py | 40 +++++----
tests/test_FindTest.py | 179 +++++------------------------------------
2 files changed, 46 insertions(+), 173 deletions(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index 82e63b59..3d48f4a2 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -318,6 +318,16 @@ def disambiguate_reporters(
]
+def overlapping_citations(cite1, cite2) -> bool:
+ """Check if citations overlap at all
+
+ Returns: True or false
+ """
+ start_1, end_1 = cite1.full_span()
+ start_2, end_2 = cite2.full_span()
+ return max(start_1, start_2) < min(end_1, end_2)
+
+
def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
"""Filter and order citations, ensuring reference citations are in sequence
@@ -330,18 +340,23 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
:return: Sorted and filtered citations
"""
filtered_citations: List[CitationBase] = []
- sorted_citations = sorted(citations, key=lambda citation: citation.span())
+ sorted_citations = sorted(
+ citations, key=lambda citation: citation.full_span()
+ )
for citation in sorted_citations:
if filtered_citations:
last_citation = filtered_citations[-1]
- last_span = last_citation.span()
- current_span = citation.span()
-
- if current_span == last_span and isinstance(
- last_citation, ReferenceCitation
- ):
- # a single ReferenceCitation may be found via different
- # names. Save the name metadata to account for collisions
+ is_overlapping = overlapping_citations(citation, last_citation)
+ if is_overlapping and isinstance(last_citation, ReferenceCitation):
+ # Remove the overlapping reference citation
+ filtered_citations.pop(-1)
+ filtered_citations.append(citation)
+ continue
+ if is_overlapping and isinstance(citation, ReferenceCitation):
+ # Skip overlapping reference citations
+ continue
+ filtered_citations.append(citation)
+ if isinstance(last_citation, ReferenceCitation):
for field in ReferenceCitation.name_fields:
if not getattr(last_citation.metadata, field):
setattr(
@@ -349,11 +364,8 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
field,
getattr(citation.metadata, field),
)
-
- if current_span[0] <= last_span[1]:
- # Remove overlapping citations that can occur in edge cases
- continue
- filtered_citations.append(citation)
+ else:
+ filtered_citations.append(citation)
return filtered_citations
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 291c9ae1..4c22a9f0 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -724,166 +724,27 @@ def test_date_in_editions(self):
% (edition[0], year, expected, date_in_reporter),
)
- def test_disambiguate_citations(self):
- # fmt: off
- test_pairs = [
- # 1. P.R.R --> Correct abbreviation for a reporter.
- ('1 P.R.R. 1',
- [case_citation(reporter='P.R.R.')]),
- # 2. U. S. --> A simple variant to resolve.
- ('1 U. S. 1',
- [case_citation(reporter_found='U. S.')]),
- # 3. A.2d --> Not a variant, but needs to be looked up in the
- # EDITIONS variable.
- ('1 A.2d 1',
- [case_citation(reporter='A.2d')]),
- # 4. A. 2d --> An unambiguous variant of an edition
- ('1 A. 2d 1',
- [case_citation(reporter='A.2d', reporter_found='A. 2d')]),
- # 5. P.R. --> A variant of 'Pen. & W.', 'P.R.R.', or 'P.' that's
- # resolvable by year
- ('1 P.R. 1 (1831)',
- # Of the three, only Pen & W. was being published this year.
- [case_citation(reporter='Pen. & W.',
- year=1831, reporter_found='P.R.')]),
- # 5.1: W.2d --> A variant of an edition that either resolves to
- # 'Wis. 2d' or 'Wash. 2d' and is resolvable by year.
- ('1 W.2d 1 (1854)',
- # Of the two, only Wis. 2d was being published this year.
- [case_citation(reporter='Wis. 2d',
- year=1854, reporter_found='W.2d')]),
- # 5.2: Wash. --> A non-variant that has more than one reporter for
- # the key, but is resolvable by year
- ('1 Wash. 1 (1890)',
- [case_citation(reporter='Wash.', year=1890)]),
- # 6. Cr. --> A variant of Cranch, which is ambiguous, except with
- # paired with this variation.
- ('1 Cra. 1',
- [case_citation(reporter='Cranch', reporter_found='Cra.',
- metadata={'court': 'scotus'})]),
- # 7. Cranch. --> Not a variant, but could refer to either Cranch's
- # Supreme Court cases or his DC ones. In this case, we cannot
- # disambiguate. Years are not known, and we have no further
- # clues. We must simply drop Cranch from the results.
- ('1 Cranch 1 1 U.S. 23',
- [case_citation(page='23')]),
- # 8. Unsolved problem. In theory, we could use parallel citations
- # to resolve this, because Rob is getting cited next to La., but
- # we don't currently know the proximity of citations to each
- # other, so can't use this.
- # - Rob. --> Either:
- # 8.1: A variant of Robards (1862-1865) or
- # 8.2: Robinson's Louisiana Reports (1841-1846) or
- # 8.3: Robinson's Virgina Reports (1842-1865)
- # ('1 Rob. 1 1 La. 1',
- # [case_citation(volume='1', reporter='Rob.', page='1'),
- # case_citation(volume='1', reporter='La.', page='1')]),
- # 9. Johnson #1 should pass and identify the citation
- ('1 Johnson 1 (1890)',
- [case_citation(reporter='N.M. (J.)', reporter_found='Johnson',
- year=1890,
- )]),
- # 10. Johnson #2 should fail to disambiguate with year alone
- ('1 Johnson 1 (1806)', []),
- ]
- # fmt: on
- # all tests in this suite require disambiguation:
- test_pairs = [
- pair + ({"remove_ambiguous": True},) for pair in test_pairs
- ]
- self.run_test_pairs(test_pairs, "Disambiguation")
+ def test_citation_filtering(self):
+ """Can we filter out reference citations safely?"""
- def test_custom_tokenizer(self):
- extractors = []
- for e in EXTRACTORS:
- e = copy(e)
- e.regex = e.regex.replace(r"\.", r"[.,]")
- if hasattr(e, "_compiled_regex"):
- del e._compiled_regex
- extractors.append(e)
- tokenizer = Tokenizer(extractors)
+ # ".... at Conley v. Gibson, 355 Mass. 41, 42 (1999) ..."
- # fmt: off
- test_pairs = [
- ('1 U,S, 1',
- [case_citation(reporter_found='U,S,')]),
- ]
- # fmt: on
- self.run_test_pairs(
- test_pairs, "Custom tokenizer", tokenizers=[tokenizer]
- )
+ citations = [
+ case_citation(volume="355", page='41', reporter_found='U.S.',
+ short=False,
+ span_start=26,
+ span_end=38,
+ full_span_start=8,
+ full_span_end=49,
+ metadata={
+ 'plaintiff': 'Conley',
+ 'defendant': 'Gibson'
+ }
- def test_citation_fullspan(self):
- """Check that the full_span function returns the correct indices."""
-
- # Make sure it works with several citations in one string
- combined_example = "citation number one is Wilson v. Mar. Overseas Corp., 150 F.3d 1, 6-7 ( 1st Cir. 1998); This is different from Commonwealth v. Bauer, 604 A.2d 1098 (Pa.Super. 1992), my second example"
- extracted = get_citations(combined_example)
- # answers format is (citation_index, (full_span_start, full_span_end))
- answers = [(0, (23, 86)), (1, (111, 164))]
- for cit_idx, (start, end) in answers:
- self.assertEqual(
- extracted[cit_idx].full_span()[0],
- start,
- f"full_span start index doesn't match for {extracted[cit_idx]}",
- )
- self.assertEqual(
- extracted[cit_idx].full_span()[1],
- end,
- f"full_span end index doesn't match for {extracted[cit_idx]}",
- )
-
- # full_span should cover the whole string
- simple_examples = [
- "66 B.U. L. Rev. 71 (1986)",
- "5 Minn. L. Rev. 1339, 1341 (1991)",
- "42 U.S.C. § 405(r)(2) (2019)",
- "37 A.L.R.4th 972, 974 (1985)",
- "497 Fed. Appx. 274 (4th Cir. 2012)",
- "Corp. v. Nature's Farm Prods., No. 99 Civ. 9404 (SHS), 2000 U.S. Dist. LEXIS 12335 (S.D.N.Y. Aug. 25, 2000)",
- "Alderson v. Concordia Par. Corr. Facility, 848 F.3d 415 (5th Cir. 2017)",
- ]
- for example in simple_examples:
- extracted = get_citations(example)[0]
- error_msg = "Full span indices for a simple example should be (0, len(example)) "
- self.assertEqual(
- extracted.full_span(), (0, len(example)), error_msg
- )
- # Sentence and correct start_index
- stopword_examples = [
- ("See 66 B.U. L. Rev. 71 (1986)", 4),
- ("Citing 66 B.U. L. Rev. 71 (1986)", 7),
- ]
- for sentence, start_idx in stopword_examples:
- extracted = get_citations(sentence)[0]
- error_msg = "Wrong span for stopword example"
- self.assertEqual(
- extracted.full_span(), (start_idx, len(sentence)), error_msg
- )
-
- def test_reference_extraction(self):
- """Can we extract a reference citation using resolved metadata?"""
- texts = [
- # In this case the reference citation got with the
- # resolved_case_name is redundant, was already got in the regular
- # process. Can we deduplicate?
- """See, e.g., State v. Wingler, 135 A. 2d 468 (1957);
- [State v. Wingler at 175, citing, Minnesota ex rel.]""",
- # In this case the resolved_case_name actually helps getting the
- # reference citation
- """See, e.g., State v. W1ngler, 135 A. 2d 468 (1957);
- [State v. Wingler at 175, citing, Minnesota ex rel.]""",
+ ),
+ reference_citation("Conley", span_start=8, span_end=14),
+ reference_citation("Conley", span_start=18, span_end=24)
]
- for plain_text in texts:
- citations = get_citations(plain_text)
- citations[0].metadata.resolved_case_name = "State v. Wingler"
- references = extract_reference_citations(citations[0], plain_text)
- final_citations = filter_citations(citations + references)
- self.assertEqual(
- len(final_citations), 2, "There should only be 2 citations"
- )
- self.assertEqual(
- len(references),
- 1,
- "Only a reference citation should had been picked up",
- )
+ self.assertEqual(len(citations), 3)
+ filtered_citations = filter_citations(citations)
+ self.assertEqual(len(filtered_citations), 1)
From f90eb61774ea5b0777f9c37e4506bdb13155e32f Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 16:31:58 -0500
Subject: [PATCH 27/40] tests(find_test): Add test for filter
---
tests/test_FindTest.py | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 4c22a9f0..86e0c116 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -9,7 +9,7 @@
# by default tests use a cache for speed
# call tests with `EYECITE_CACHE_DIR= python ...` to disable cache
-from eyecite.models import ResourceCitation
+from eyecite.models import ResourceCitation, FullCitation, FullCaseCitation
from eyecite.test_factories import (
case_citation,
id_citation,
@@ -725,9 +725,16 @@ def test_date_in_editions(self):
)
def test_citation_filtering(self):
- """Can we filter out reference citations safely?"""
+ """Ensure citations with overlapping spans are correctly filtered
- # ".... at Conley v. Gibson, 355 Mass. 41, 42 (1999) ..."
+ Imagine a scenario where the text
+ .... at Conley v. Gibson, 355 Mass. 41, 42 (1999) ...
+ yields two reference citations (Conley, Gibson) plus the full citation.
+ This shouldn't occur, but if it did we would be able to filter them
+ correctly.
+ """
+
+ ".... at Conley v. Gibson, 355 Mass. 41, 42 (1999) ..."
citations = [
case_citation(volume="355", page='41', reporter_found='U.S.',
@@ -748,3 +755,4 @@ def test_citation_filtering(self):
self.assertEqual(len(citations), 3)
filtered_citations = filter_citations(citations)
self.assertEqual(len(filtered_citations), 1)
+ self.assertEqual(type(filtered_citations[0]), FullCaseCitation)
From ebc72e870e37400566b06fd7442fce81512e92f8 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 18:03:18 -0500
Subject: [PATCH 28/40] fix(resolve): Fix span end for citations
We assigned the span end for parentheticals prior
to refining the parenthetical.
Also, remove duplicate reference citations
at the start.
Finally, fix a bug in filtering.
---
eyecite/find.py | 6 ++++++
eyecite/helpers.py | 13 +++++--------
eyecite/resolve.py | 28 +++++++++++++---------------
3 files changed, 24 insertions(+), 23 deletions(-)
diff --git a/eyecite/find.py b/eyecite/find.py
index 90985bed..87a58e1a 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -81,6 +81,9 @@ def get_citations(
# Using the plaintiff or defendant
references = extract_reference_citations(citation, plain_text)
citations.extend(references)
+ # if a duplicate citation is found from another citation
+ # remove it essentially - we resolve this later
+ citations = list(set(citations))
# CASE 2: Token is an "Id." or "Ibid." reference.
# In this case, the citation should simply be to the item cited
@@ -334,3 +337,6 @@ def _extract_id_citation(
"parenthetical": parenthetical,
},
)
+
+
+# [he has not agreed so to submit.’” Howsam v. Dean Witter Reynolds, Inc. , 537 U.S. 79, (2002) (combined mandamus and interlocutory appeal) (citing Howsam at 84, 123 S. Ct. at 592)]
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index 3d48f4a2..f970fd48 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -101,6 +101,11 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
citation.metadata.extra = (m["extra"] or "").strip() or None
citation.metadata.parenthetical = process_parenthetical(m["parenthetical"])
+ if m["parenthetical"] != None:
+ if len(m["parenthetical"]) > len(citation.metadata.parenthetical):
+ citation.full_span_end = citation.full_span_end - (
+ len(m["parenthetical"]) - len(citation.metadata.parenthetical)
+ )
citation.metadata.year = m["year"]
if m["year"]:
citation.year = get_year(m["year"])
@@ -356,14 +361,6 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
# Skip overlapping reference citations
continue
filtered_citations.append(citation)
- if isinstance(last_citation, ReferenceCitation):
- for field in ReferenceCitation.name_fields:
- if not getattr(last_citation.metadata, field):
- setattr(
- last_citation.metadata,
- field,
- getattr(citation.metadata, field),
- )
else:
filtered_citations.append(citation)
return filtered_citations
diff --git a/eyecite/resolve.py b/eyecite/resolve.py
index 6defca58..a0274bcf 100644
--- a/eyecite/resolve.py
+++ b/eyecite/resolve.py
@@ -91,22 +91,20 @@ def _filter_by_matching_plaintiff_or_defendant_or_resolved_names(
"""Filter out reference citations that point to more than 1 Resource"""
matches: List[ResourceType] = []
- for full_citation, resource in resolved_full_cites:
- if not isinstance(full_citation, FullCaseCitation):
- continue
-
- for key in ReferenceCitation.name_fields:
- reference_value = getattr(reference_citation.metadata, key)
- full_case_value = getattr(full_citation.metadata, key)
- if (
- reference_value
- and full_case_value
- and reference_value in full_case_value
- ):
- matches.append(resource)
- break
+ match_count = 0
+ reference_values = []
+ for key in ReferenceCitation.name_fields:
+ reference_value = getattr(reference_citation.metadata, key)
+ if reference_value:
+ reference_values.append(reference_value)
+ for citation, resource in resolved_full_cites:
+ full_cite_values = list(
+ [value for value in citation.metadata.__dict__.values() if value]
+ )
+ if set(full_cite_values) & set(reference_values):
+ match_count += 1
+ matches.append(resource)
- # Remove duplicates and only accept if one candidate remains
matches = list(set(matches))
return matches[0] if len(matches) == 1 else None
From e213bb312a08cd727e95dfb695334b28f9bf1f4e Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 18:09:33 -0500
Subject: [PATCH 29/40] fix(tests): Lint
---
tests/test_FindTest.py | 25 ++++++++++++-------------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 86e0c116..1c855271 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -737,20 +737,19 @@ def test_citation_filtering(self):
".... at Conley v. Gibson, 355 Mass. 41, 42 (1999) ..."
citations = [
- case_citation(volume="355", page='41', reporter_found='U.S.',
- short=False,
- span_start=26,
- span_end=38,
- full_span_start=8,
- full_span_end=49,
- metadata={
- 'plaintiff': 'Conley',
- 'defendant': 'Gibson'
- }
-
- ),
+ case_citation(
+ volume="355",
+ page="41",
+ reporter_found="U.S.",
+ short=False,
+ span_start=26,
+ span_end=38,
+ full_span_start=8,
+ full_span_end=49,
+ metadata={"plaintiff": "Conley", "defendant": "Gibson"},
+ ),
reference_citation("Conley", span_start=8, span_end=14),
- reference_citation("Conley", span_start=18, span_end=24)
+ reference_citation("Conley", span_start=18, span_end=24),
]
self.assertEqual(len(citations), 3)
filtered_citations = filter_citations(citations)
From fba0b45c17960b24c195d7eb5f0667976b1773e5 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 18:12:26 -0500
Subject: [PATCH 30/40] fix(tests): Lint flake8
---
eyecite/find.py | 3 ---
eyecite/helpers.py | 2 +-
tests/test_FindTest.py | 5 +----
3 files changed, 2 insertions(+), 8 deletions(-)
diff --git a/eyecite/find.py b/eyecite/find.py
index 87a58e1a..2512592d 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -337,6 +337,3 @@ def _extract_id_citation(
"parenthetical": parenthetical,
},
)
-
-
-# [he has not agreed so to submit.’” Howsam v. Dean Witter Reynolds, Inc. , 537 U.S. 79, (2002) (combined mandamus and interlocutory appeal) (citing Howsam at 84, 123 S. Ct. at 592)]
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index f970fd48..cb509129 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -101,7 +101,7 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
citation.metadata.extra = (m["extra"] or "").strip() or None
citation.metadata.parenthetical = process_parenthetical(m["parenthetical"])
- if m["parenthetical"] != None:
+ if m["parenthetical"] is not None:
if len(m["parenthetical"]) > len(citation.metadata.parenthetical):
citation.full_span_end = citation.full_span_end - (
len(m["parenthetical"]) - len(citation.metadata.parenthetical)
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 1c855271..6dfdd4b8 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -1,15 +1,13 @@
import os
-from copy import copy
from datetime import datetime
from unittest import TestCase
from eyecite import clean_text, get_citations
-from eyecite.find import extract_reference_citations
from eyecite.helpers import filter_citations
# by default tests use a cache for speed
# call tests with `EYECITE_CACHE_DIR= python ...` to disable cache
-from eyecite.models import ResourceCitation, FullCitation, FullCaseCitation
+from eyecite.models import ResourceCitation, FullCaseCitation
from eyecite.test_factories import (
case_citation,
id_citation,
@@ -21,7 +19,6 @@
)
from eyecite.tokenizers import (
EDITIONS_LOOKUP,
- EXTRACTORS,
AhocorasickTokenizer,
HyperscanTokenizer,
Tokenizer,
From 3cb51814368aa20815f265fb6960f75d16e64420 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 18:14:13 -0500
Subject: [PATCH 31/40] fix(tests): Lint iSort
---
tests/test_FindTest.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 6dfdd4b8..5137b5fb 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -7,7 +7,7 @@
# by default tests use a cache for speed
# call tests with `EYECITE_CACHE_DIR= python ...` to disable cache
-from eyecite.models import ResourceCitation, FullCaseCitation
+from eyecite.models import FullCaseCitation, ResourceCitation
from eyecite.test_factories import (
case_citation,
id_citation,
From 13d3ac5e2bab41d2871c358d4c32819157fb7bb0 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 18:20:51 -0500
Subject: [PATCH 32/40] fix(tests): Lint typing
---
eyecite/helpers.py | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index cb509129..d10fa3c0 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -101,10 +101,13 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
citation.metadata.extra = (m["extra"] or "").strip() or None
citation.metadata.parenthetical = process_parenthetical(m["parenthetical"])
- if m["parenthetical"] is not None:
+ if (
+ m["parenthetical"] is not None
+ and citation.metadata.parenthetical is not None
+ ):
if len(m["parenthetical"]) > len(citation.metadata.parenthetical):
- citation.full_span_end = citation.full_span_end - (
- len(m["parenthetical"]) - len(citation.metadata.parenthetical)
+ citation.full_span_end -= len(m["parenthetical"]) - len(
+ citation.metadata.parenthetical
)
citation.metadata.year = m["year"]
if m["year"]:
@@ -323,11 +326,8 @@ def disambiguate_reporters(
]
-def overlapping_citations(cite1, cite2) -> bool:
- """Check if citations overlap at all
-
- Returns: True or false
- """
+def overlapping_citations(cite1: CaseCitation, cite2: CaseCitation) -> bool:
+ """Check if citations overlap at all"""
start_1, end_1 = cite1.full_span()
start_2, end_2 = cite2.full_span()
return max(start_1, start_2) < min(end_1, end_2)
From a7560fea97de27ac25f47d178d292e837aa7e27e Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 18:24:23 -0500
Subject: [PATCH 33/40] fix(tests): Lint typing
---
eyecite/helpers.py | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index d10fa3c0..5f1ec28e 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -326,10 +326,12 @@ def disambiguate_reporters(
]
-def overlapping_citations(cite1: CaseCitation, cite2: CaseCitation) -> bool:
+def overlapping_citations(
+ full_span_1: Tuple[int, int], full_span_2: Tuple[int, int]
+) -> bool:
"""Check if citations overlap at all"""
- start_1, end_1 = cite1.full_span()
- start_2, end_2 = cite2.full_span()
+ start_1, end_1 = full_span_1
+ start_2, end_2 = full_span_2
return max(start_1, start_2) < min(end_1, end_2)
@@ -351,7 +353,9 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
for citation in sorted_citations:
if filtered_citations:
last_citation = filtered_citations[-1]
- is_overlapping = overlapping_citations(citation, last_citation)
+ is_overlapping = overlapping_citations(
+ citation.full_span(), last_citation.full_span()
+ )
if is_overlapping and isinstance(last_citation, ReferenceCitation):
# Remove the overlapping reference citation
filtered_citations.pop(-1)
From c2f4ddad162693e145b7792c916a5b543285c6df Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 18:31:08 -0500
Subject: [PATCH 34/40] fix(tests): Lint typing
---
eyecite/helpers.py | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index 5f1ec28e..55b065d2 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -101,11 +101,9 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
citation.metadata.extra = (m["extra"] or "").strip() or None
citation.metadata.parenthetical = process_parenthetical(m["parenthetical"])
- if (
- m["parenthetical"] is not None
- and citation.metadata.parenthetical is not None
- ):
- if len(m["parenthetical"]) > len(citation.metadata.parenthetical):
+ paren = citation.metadata.parenthetical or ""
+ if m["parenthetical"] is not None:
+ if len(m["parenthetical"]) > len(paren):
citation.full_span_end -= len(m["parenthetical"]) - len(
citation.metadata.parenthetical
)
From ad7a95991ede50cd1bdd00fae62dcc8aede452af Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 18:34:07 -0500
Subject: [PATCH 35/40] fix(tests): Lint typing
---
eyecite/helpers.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index 55b065d2..fa980120 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -101,9 +101,8 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
citation.metadata.extra = (m["extra"] or "").strip() or None
citation.metadata.parenthetical = process_parenthetical(m["parenthetical"])
- paren = citation.metadata.parenthetical or ""
- if m["parenthetical"] is not None:
- if len(m["parenthetical"]) > len(paren):
+ if m["parenthetical"] is not None and isinstance(citation.metadata.parenthetical, str):
+ if len(m["parenthetical"]) > len(citation.metadata.parenthetical):
citation.full_span_end -= len(m["parenthetical"]) - len(
citation.metadata.parenthetical
)
From 6da9d6c1823ead93d769bcee3d99c1c0ae943274 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 6 Feb 2025 18:37:10 -0500
Subject: [PATCH 36/40] fix(tests): Lint typing
---
eyecite/helpers.py | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index fa980120..ac35bd40 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -101,11 +101,17 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
citation.metadata.extra = (m["extra"] or "").strip() or None
citation.metadata.parenthetical = process_parenthetical(m["parenthetical"])
- if m["parenthetical"] is not None and isinstance(citation.metadata.parenthetical, str):
+
+ if (
+ citation.full_span_end
+ and m["parenthetical"] is not None
+ and isinstance(citation.metadata.parenthetical, str)
+ ):
if len(m["parenthetical"]) > len(citation.metadata.parenthetical):
- citation.full_span_end -= len(m["parenthetical"]) - len(
+ offset = len(m["parenthetical"]) - len(
citation.metadata.parenthetical
)
+ citation.full_span_end = citation.full_span_end - offset
citation.metadata.year = m["year"]
if m["year"]:
citation.year = get_year(m["year"])
From 958fc5ba4e5983337ceb64296fab558c37a03ed7 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 7 Feb 2025 09:12:09 -0500
Subject: [PATCH 37/40] fix(tests): Reintroduce previous tests
---
tests/test_FindTest.py | 179 +++++++++++++++++++++++++++++++++++++++--
1 file changed, 174 insertions(+), 5 deletions(-)
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 5137b5fb..f9d5f4e2 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -1,13 +1,15 @@
import os
+from copy import copy
from datetime import datetime
from unittest import TestCase
from eyecite import clean_text, get_citations
+from eyecite.find import extract_reference_citations
from eyecite.helpers import filter_citations
# by default tests use a cache for speed
# call tests with `EYECITE_CACHE_DIR= python ...` to disable cache
-from eyecite.models import FullCaseCitation, ResourceCitation
+from eyecite.models import FullCaseCitation, FullCitation, ResourceCitation
from eyecite.test_factories import (
case_citation,
id_citation,
@@ -19,6 +21,7 @@
)
from eyecite.tokenizers import (
EDITIONS_LOOKUP,
+ EXTRACTORS,
AhocorasickTokenizer,
HyperscanTokenizer,
Tokenizer,
@@ -724,15 +727,13 @@ def test_date_in_editions(self):
def test_citation_filtering(self):
"""Ensure citations with overlapping spans are correctly filtered
- Imagine a scenario where
+ Imagine a scenario where a bug incorrectly identifies the following
.... at Conley v. Gibson, 355 Mass. 41, 42 (1999) ...
- this returns two reference citations Conley, Gibson and the full citation
+ this returns two reference citations Conley, Gibson and the full cite
this shouldn't occur but if it did we would be able to filter these
correcly
"""
-
".... at Conley v. Gibson, 355 Mass. 41, 42 (1999) ..."
-
citations = [
case_citation(
volume="355",
@@ -752,3 +753,171 @@ def test_citation_filtering(self):
filtered_citations = filter_citations(citations)
self.assertEqual(len(filtered_citations), 1)
self.assertEqual(type(filtered_citations[0]), FullCaseCitation)
+
+ def test_disambiguate_citations(self):
+ # fmt: off
+ test_pairs = [
+ # 1. P.R.R --> Correct abbreviation for a reporter.
+ ('1 P.R.R. 1',
+ [case_citation(reporter='P.R.R.')]),
+ # 2. U. S. --> A simple variant to resolve.
+ ('1 U. S. 1',
+ [case_citation(reporter_found='U. S.')]),
+ # 3. A.2d --> Not a variant, but needs to be looked up in the
+ # EDITIONS variable.
+ ('1 A.2d 1',
+ [case_citation(reporter='A.2d')]),
+ # 4. A. 2d --> An unambiguous variant of an edition
+ ('1 A. 2d 1',
+ [case_citation(reporter='A.2d', reporter_found='A. 2d')]),
+ # 5. P.R. --> A variant of 'Pen. & W.', 'P.R.R.', or 'P.' that's
+ # resolvable by year
+ ('1 P.R. 1 (1831)',
+ # Of the three, only Pen & W. was being published this year.
+ [case_citation(reporter='Pen. & W.',
+ year=1831, reporter_found='P.R.')]),
+ # 5.1: W.2d --> A variant of an edition that either resolves to
+ # 'Wis. 2d' or 'Wash. 2d' and is resolvable by year.
+ ('1 W.2d 1 (1854)',
+ # Of the two, only Wis. 2d was being published this year.
+ [case_citation(reporter='Wis. 2d',
+ year=1854, reporter_found='W.2d')]),
+ # 5.2: Wash. --> A non-variant that has more than one reporter for
+ # the key, but is resolvable by year
+ ('1 Wash. 1 (1890)',
+ [case_citation(reporter='Wash.', year=1890)]),
+ # 6. Cra. --> A variant of Cranch, which is ambiguous, except when
+ # paired with this variation.
+ ('1 Cra. 1',
+ [case_citation(reporter='Cranch', reporter_found='Cra.',
+ metadata={'court': 'scotus'})]),
+ # 7. Cranch. --> Not a variant, but could refer to either Cranch's
+ # Supreme Court cases or his DC ones. In this case, we cannot
+ # disambiguate. Years are not known, and we have no further
+ # clues. We must simply drop Cranch from the results.
+ ('1 Cranch 1 1 U.S. 23',
+ [case_citation(page='23')]),
+ # 8. Unsolved problem. In theory, we could use parallel citations
+ # to resolve this, because Rob is getting cited next to La., but
+ # we don't currently know the proximity of citations to each
+ # other, so can't use this.
+ # - Rob. --> Either:
+ # 8.1: A variant of Robards (1862-1865) or
+ # 8.2: Robinson's Louisiana Reports (1841-1846) or
+ # 8.3: Robinson's Virginia Reports (1842-1865)
+ # ('1 Rob. 1 1 La. 1',
+ # [case_citation(volume='1', reporter='Rob.', page='1'),
+ # case_citation(volume='1', reporter='La.', page='1')]),
+ # 9. Johnson #1 should pass and identify the citation
+ ('1 Johnson 1 (1890)',
+ [case_citation(reporter='N.M. (J.)', reporter_found='Johnson',
+ year=1890,
+ )]),
+ # 10. Johnson #2 should fail to disambiguate with year alone
+ ('1 Johnson 1 (1806)', []),
+ ]
+ # fmt: on
+ # all tests in this suite require disambiguation:
+ test_pairs = [
+ pair + ({"remove_ambiguous": True},) for pair in test_pairs
+ ]
+ self.run_test_pairs(test_pairs, "Disambiguation")
+
+ def test_custom_tokenizer(self):
+ extractors = []
+ for e in EXTRACTORS:
+ e = copy(e)
+ e.regex = e.regex.replace(r"\.", r"[.,]")
+ if hasattr(e, "_compiled_regex"):
+ del e._compiled_regex
+ extractors.append(e)
+ tokenizer = Tokenizer(extractors)
+
+ # fmt: off
+ test_pairs = [
+ ('1 U,S, 1',
+ [case_citation(reporter_found='U,S,')]),
+ ]
+ # fmt: on
+ self.run_test_pairs(
+ test_pairs, "Custom tokenizer", tokenizers=[tokenizer]
+ )
+
+ def test_citation_fullspan(self):
+ """Check that the full_span function returns the correct indices."""
+
+ # Make sure it works with several citations in one string
+ combined_example = "citation number one is Wilson v. Mar. Overseas Corp., 150 F.3d 1, 6-7 ( 1st Cir. 1998); This is different from Commonwealth v. Bauer, 604 A.2d 1098 (Pa.Super. 1992), my second example"
+ extracted = get_citations(combined_example)
+ # answers format is (citation_index, (full_span_start, full_span_end))
+ answers = [(0, (23, 86)), (1, (111, 164))]
+ for cit_idx, (start, end) in answers:
+ self.assertEqual(
+ extracted[cit_idx].full_span()[0],
+ start,
+ f"full_span start index doesn't match for {extracted[cit_idx]}",
+ )
+ self.assertEqual(
+ extracted[cit_idx].full_span()[1],
+ end,
+ f"full_span end index doesn't match for {extracted[cit_idx]}",
+ )
+
+ # full_span should cover the whole string
+ simple_examples = [
+ "66 B.U. L. Rev. 71 (1986)",
+ "5 Minn. L. Rev. 1339, 1341 (1991)",
+ "42 U.S.C. § 405(r)(2) (2019)",
+ "37 A.L.R.4th 972, 974 (1985)",
+ "497 Fed. Appx. 274 (4th Cir. 2012)",
+ "Corp. v. Nature's Farm Prods., No. 99 Civ. 9404 (SHS), 2000 U.S. Dist. LEXIS 12335 (S.D.N.Y. Aug. 25, 2000)",
+ "Alderson v. Concordia Par. Corr. Facility, 848 F.3d 415 (5th Cir. 2017)",
+ ]
+ for example in simple_examples:
+ extracted = get_citations(example)[0]
+ error_msg = "Full span indices for a simple example should be (0, len(example)) "
+ self.assertEqual(
+ extracted.full_span(), (0, len(example)), error_msg
+ )
+ # Sentence and correct start_index
+ stopword_examples = [
+ ("See 66 B.U. L. Rev. 71 (1986)", 4),
+ ("Citing 66 B.U. L. Rev. 71 (1986)", 7),
+ ]
+ for sentence, start_idx in stopword_examples:
+ extracted = get_citations(sentence)[0]
+ error_msg = "Wrong span for stopword example"
+ self.assertEqual(
+ extracted.full_span(), (start_idx, len(sentence)), error_msg
+ )
+
+ def test_reference_extraction(self):
+ """Can we extract a reference citation using resolved metadata?"""
+ texts = [
+ # In this case the reference citation obtained with the
+ # resolved_case_name is redundant; it was already found by the
+ # regular process. Can we deduplicate?
+ """See, e.g., State v. Wingler, 135 A. 2d 468 (1957);
+ [State v. Wingler at 175, citing, Minnesota ex rel.]""",
+ # In this case the resolved_case_name actually helps to find the
+ # reference citation
+ """See, e.g., State v. W1ngler, 135 A. 2d 468 (1957);
+ [State v. Wingler at 175, citing, Minnesota ex rel.]""",
+ ]
+ for plain_text in texts:
+ citations = get_citations(plain_text)
+ found_cite = citations[0]
+ if isinstance(found_cite, FullCitation):
+ found_cite.metadata.resolved_case_name = "State v. Wingler"
+ references = extract_reference_citations(
+ found_cite, plain_text
+ )
+ final_citations = filter_citations(citations + references)
+ self.assertEqual(
+ len(final_citations), 2, "There should only be 2 citations"
+ )
+ self.assertEqual(
+ len(references),
+ 1,
+ "Only a reference citation should had been picked up",
+ )
From 8e882d8e8cfe057283e377ceb27cb631683a126b Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 7 Feb 2025 09:47:09 -0500
Subject: [PATCH 38/40] fix(filtering): Move filtering of duplicates
Move duplicate filtering into filter_citations
and deduplicate on span.
The previous set() call would remove duplicate tokens.
---
eyecite/find.py | 5 +----
eyecite/helpers.py | 1 +
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/eyecite/find.py b/eyecite/find.py
index d50bb947..54cb5156 100644
--- a/eyecite/find.py
+++ b/eyecite/find.py
@@ -82,9 +82,6 @@ def get_citations(
# Using the plaintiff or defendant
references = extract_reference_citations(citation, plain_text)
citations.extend(references)
- # if a duplicate citation is found from another citation
- # remove it essentially - we resolve this later
- citations = list(set(citations))
# CASE 2: Token is an "Id." or "Ibid." reference.
# In this case, the citation should simply be to the item cited
@@ -169,7 +166,7 @@ def is_valid_name(name: str) -> bool:
if not regexes:
return []
pin_cite_re = (
- rf"\b(?:{'|'.join(regexes)})\s+at\s+(?P<pin_cite>\d{{1,5}})\b"
+ rf"\b(?:{'|'.join(regexes)})\s+at(\s¶)?\s+(?P<pin_cite>\d{{1,5}})\b"
)
reference_citations = []
remaining_text = plain_text[citation.span()[-1] :]
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index ac35bd40..1a3d00fc 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -349,6 +349,7 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
:param citations: List of citations
:return: Sorted and filtered citations
"""
+ citations = list({citation.span(): citation for citation in citations}.values())
filtered_citations: List[CitationBase] = []
sorted_citations = sorted(
citations, key=lambda citation: citation.full_span()
From c837e842d0f5abcbc439d86bf61fdd2cb00933fc Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 7 Feb 2025 09:51:25 -0500
Subject: [PATCH 39/40] fix(helpers): lint
---
eyecite/helpers.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index 1a3d00fc..499b5826 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -349,7 +349,9 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
:param citations: List of citations
:return: Sorted and filtered citations
"""
- citations = list({citation.span(): citation for citation in citations}.values())
+ citations = list(
+ {citation.span(): citation for citation in citations}.values()
+ )
filtered_citations: List[CitationBase] = []
sorted_citations = sorted(
citations, key=lambda citation: citation.full_span()
From 0e31544b0e7ec8f0fb6db0822404aa19afdccd48 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 7 Feb 2025 10:21:56 -0500
Subject: [PATCH 40/40] fix(regexes): SHORT CITE ANTECEDENT REGEX
Require an a-z character at the start of
antecedent guesses.
---
eyecite/regexes.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/eyecite/regexes.py b/eyecite/regexes.py
index 5b29ab0b..58bd077a 100644
--- a/eyecite/regexes.py
+++ b/eyecite/regexes.py
@@ -212,7 +212,7 @@ def short_cite_re(regex):
# What case does a short cite refer to? For now, we just capture the previous
# word optionally followed by a comma. Example: Adarand, 515 U.S. at 241.
SHORT_CITE_ANTECEDENT_REGEX = r"""
- (?P<antecedent>[\w\-.]+),?
+ (?P<antecedent>[A-Za-z][\w\-.]+),?
\ # final space
"""