Skip to content

Commit

Permalink
fix(tests): Make reference citation extraction stricter
Browse files Browse the repository at this point in the history
Limit the names that can be used to better-formatted
plaintiffs/defendants.

Add tests to show filtering/ordering of reference citations.
Also refactor add_defendant to handle the edge case where the
defendant could be only whitespace.
Fix typos, etc.
  • Loading branch information
flooie committed Jan 22, 2025
1 parent 6507d01 commit 67df9a8
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 40 deletions.
68 changes: 31 additions & 37 deletions eyecite/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,44 +131,46 @@ def _extract_reference_citations(
:param citation: the full case citation found
:param plain_text: the text
:return: Pincite reference citations
:return: Pin cite reference citations
"""
if not isinstance(citation, FullCaseCitation):
# Skip if not case law citation
if len(plain_text) <= citation.span()[-1]:
return []
if not citation.metadata.defendant:
# Skip if no defendant exists
if not isinstance(citation, FullCaseCitation):
return []
plaintiff_regex = (
rf"(?P<plaintiff>{re.escape(citation.metadata.plaintiff)})"
if citation.metadata.plaintiff
else ""
)
defendant_regex = (
rf"(?P<defendant>{re.escape(citation.metadata.defendant)})"
if citation.metadata.defendant
else ""
)

# Combine the components if they are not empty
combined_regex_parts = "|".join(
filter(None, [plaintiff_regex, defendant_regex])
)
pin_cite_regex = (
rf"\b(?:{combined_regex_parts})\s+at\s+(?P<page>\d{{1,5}})\b"
)
def is_valid_name(name: str) -> bool:
"""Validate name isnt a regex issue
pin_cite_pattern = re.compile(pin_cite_regex)
reference_citations = []
if len(plain_text) <= citation.span()[-1]:
return []
Excludes strings like Co., numbers or lower case strs
:param name: The name to check
:return: True if usable, false if not
"""
return (
isinstance(name, str)
and len(name) > 2
and name[0].isupper()
and not name.endswith(".")
and not name.isdigit()
)

regexes = [
rf"(?P<{key}>{re.escape(value)})"
for key in ["plaintiff", "defendant"]
if (value := getattr(citation.metadata, key, None))
and is_valid_name(value)
]
if not regexes:
return []
pin_cite_re = (
rf"\b(?:{'|'.join(regexes)})\s+at\s+(?P<pin_cite>\d{{1,5}})\b"
)
reference_citations = []
remaining_text = plain_text[citation.span()[-1] :]
offset = citation.span()[-1]
for match in pin_cite_pattern.finditer(remaining_text):
for match in re.compile(pin_cite_re).finditer(remaining_text):
start, end = match.span()
matched_text = match.group(0)

reference = ReferenceCitation(
token=CaseReferenceToken(
data=matched_text, start=start + offset, end=end + offset
Expand All @@ -178,15 +180,7 @@ def _extract_reference_citations(
full_span_start=start + offset,
full_span_end=end + offset,
index=0,
metadata={
"plaintiff": (
match.group("plaintiff")
if "plaintiff" in match.groupdict()
else None
),
"defendant": match.group("defendant"),
"pin_cite": match.group("page"),
},
metadata=match.groupdict(),
)
reference_citations.append(reference)
return reference_citations
Expand Down
6 changes: 4 additions & 2 deletions eyecite/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,11 @@ def add_defendant(citation: CaseCitation, words: Tokens) -> None:
break
if start_index:
citation.full_span_start = citation.span()[0] - offset
citation.metadata.defendant = "".join(
defendant = "".join(
str(w) for w in words[start_index : citation.index]
).strip(", ")
if defendant.strip():
citation.metadata.defendant = defendant


def add_law_metadata(citation: FullLawCitation, words: Tokens) -> None:
Expand Down Expand Up @@ -310,7 +312,7 @@ def disambiguate_reporters(
def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
"""Filter and order citations that may have reference cites out of order
:param citations: List of citation`
:param citations: List of citations
:return: Sorted and filtered citations
"""
filtered_citations: List[CitationBase] = []
Expand Down
4 changes: 3 additions & 1 deletion eyecite/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,9 @@ def is_parallel_citation(self, preceding: CitationBase):
and isinstance(preceding, FullCaseCitation)
)
if is_parallel:
# if parallel merge plaintiff/defendant data
# if parallel get plaintiff/defendant data from
# the earlier citation, since it won't be on the
# parallel one.
self.metadata.defendant = preceding.metadata.defendant
self.metadata.plaintiff = preceding.metadata.plaintiff

Expand Down
32 changes: 32 additions & 0 deletions tests/test_FindTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,38 @@ def test_find_citations(self):
[],),
('lorem 111 N. W. 12th St.',
[],),
# Eyecite has issue with linebreaks when identifying defendants and
# previously could store defendant as only whitespace
('<em>\n rt. denied,\n </em>\n \n 541 U.S. 1085 (2004);\n <em>\n',
[case_citation(
page='1085',
volume="541",
reporter="U.S.",
year=2004,
metadata={'plaintiff': None,
'defendant': None,
'court': 'scotus'
}
)],
{'clean': ['html', 'inline_whitespace']}),
# Test filtering overlapping citations - this finds four citations
# but should filter down to three
("Miles v. Smith 1 Ga. 1; asdfasdf asd Something v. Else, 1 Miles 3; 1 Miles at 10",
[case_citation(page='1',
volume="1",
reporter="Ga.",
metadata={'plaintiff': 'Miles',
'defendant': 'Smith',
}),
case_citation(page='3',
volume="1",
reporter="Miles",
metadata={'plaintiff': 'Something',
'defendant': 'Else'}
),
case_citation(volume="1", page='10', reporter='Miles',
short=True,
metadata={'pin_cite': '10'})]),
('General Casualty cites as compelling Amick v. Liberty Mut. Ins. Co., 455 A.2d 793 (R.I. 1983). In that case ... Stats, do. See Amick at 795',
[case_citation(page='793',
volume="455",
Expand Down

0 comments on commit 67df9a8

Please sign in to comment.