diff --git a/eyecite/find.py b/eyecite/find.py index 1fbfbd9..c3cc4c3 100644 --- a/eyecite/find.py +++ b/eyecite/find.py @@ -131,44 +131,46 @@ def _extract_reference_citations( :param citation: the full case citation found :param plain_text: the text - :return: Pincite reference citations + :return: Pin cite reference citations """ - if not isinstance(citation, FullCaseCitation): - # Skip if not case law citation + if len(plain_text) <= citation.span()[-1]: return [] - if not citation.metadata.defendant: - # Skip if no defendant exists + if not isinstance(citation, FullCaseCitation): return [] - plaintiff_regex = ( - rf"(?P{re.escape(citation.metadata.plaintiff)})" - if citation.metadata.plaintiff - else "" - ) - defendant_regex = ( - rf"(?P{re.escape(citation.metadata.defendant)})" - if citation.metadata.defendant - else "" - ) - # Combine the components if they are not empty - combined_regex_parts = "|".join( - filter(None, [plaintiff_regex, defendant_regex]) - ) - pin_cite_regex = ( - rf"\b(?:{combined_regex_parts})\s+at\s+(?P\d{{1,5}})\b" - ) + def is_valid_name(name: str) -> bool: + """Validate name isnt a regex issue - pin_cite_pattern = re.compile(pin_cite_regex) - reference_citations = [] - if len(plain_text) <= citation.span()[-1]: - return [] + Excludes strings like Co., numbers or lower case strs + + :param name: The name to check + :return: True if usable, false if not + """ + return ( + isinstance(name, str) + and len(name) > 2 + and name[0].isupper() + and not name.endswith(".") + and not name.isdigit() + ) + regexes = [ + rf"(?P<{key}>{re.escape(value)})" + for key in ["plaintiff", "defendant"] + if (value := getattr(citation.metadata, key, None)) + and is_valid_name(value) + ] + if not regexes: + return [] + pin_cite_re = ( + rf"\b(?:{'|'.join(regexes)})\s+at\s+(?P\d{{1,5}})\b" + ) + reference_citations = [] remaining_text = plain_text[citation.span()[-1] :] offset = citation.span()[-1] - for match in pin_cite_pattern.finditer(remaining_text): + for match in re.compile(pin_cite_re).finditer(remaining_text): start, end = match.span() matched_text = match.group(0) - reference = ReferenceCitation( token=CaseReferenceToken( data=matched_text, start=start + offset, end=end + offset @@ -178,15 +180,7 @@ def _extract_reference_citations( full_span_start=start + offset, full_span_end=end + offset, index=0, - metadata={ - "plaintiff": ( - match.group("plaintiff") - if "plaintiff" in match.groupdict() - else None - ), - "defendant": match.group("defendant"), - "pin_cite": match.group("page"), - }, + metadata=match.groupdict(), ) reference_citations.append(reference) return reference_citations diff --git a/eyecite/helpers.py b/eyecite/helpers.py index c3ab843..c5ccfc6 100644 --- a/eyecite/helpers.py +++ b/eyecite/helpers.py @@ -133,9 +133,11 @@ def add_defendant(citation: CaseCitation, words: Tokens) -> None: break if start_index: citation.full_span_start = citation.span()[0] - offset - citation.metadata.defendant = "".join( + defendant = "".join( str(w) for w in words[start_index : citation.index] ).strip(", ") + if defendant.strip(): + citation.metadata.defendant = defendant def add_law_metadata(citation: FullLawCitation, words: Tokens) -> None: @@ -310,7 +312,7 @@ def disambiguate_reporters( def filter_citations(citations: List[CitationBase]) -> List[CitationBase]: """Filter and order citations that may have reference cites out of order - :param citations: List of citation` + :param citations: List of citations :return: Sorted and filtered citations """ filtered_citations: List[CitationBase] = [] diff --git a/eyecite/models.py b/eyecite/models.py index 8160922..c0fc5e2 100644 --- a/eyecite/models.py +++ b/eyecite/models.py @@ -315,7 +315,9 @@ def is_parallel_citation(self, preceding: CitationBase): and isinstance(preceding, FullCaseCitation) ) if is_parallel: - # if parallel merge plaintiff/defendant data + # if parallel get plaintiff/defendant data from + # the earlier citation, since it won't be on the + # parallel one. self.metadata.defendant = preceding.metadata.defendant self.metadata.plaintiff = preceding.metadata.plaintiff diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index fbfc678..b1b8124 100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -441,6 +441,38 @@ def test_find_citations(self): [],), ('lorem 111 N. W. 12th St.', [],), + # Eyecite has issue with linebreaks when identifying defendants and + # previously could store defendant as only whitespace + ('\n rt. denied,\n \n \n 541 U.S. 1085 (2004);\n \n', + [case_citation( + page='1085', + volume="541", + reporter="U.S.", + year=2004, + metadata={'plaintiff': None, + 'defendant': None, + 'court': 'scotus' + } + )], + {'clean': ['html', 'inline_whitespace']}), + # Test filtering overlapping citations - this finds four citations + # but should filter down to three + ("Miles v. Smith 1 Ga. 1; asdfasdf asd Something v. Else, 1 Miles 3; 1 Miles at 10", + [case_citation(page='1', + volume="1", + reporter="Ga.", + metadata={'plaintiff': 'Miles', + 'defendant': 'Smith', + }), + case_citation(page='3', + volume="1", + reporter="Miles", + metadata={'plaintiff': 'Something', + 'defendant': 'Else'} + ), + case_citation(volume="1", page='10', reporter='Miles', + short=True, + metadata={'pin_cite': '10'})]), ('General Casualty cites as compelling Amick v. Liberty Mut. Ins. Co., 455 A.2d 793 (R.I. 1983). In that case ... Stats, do. See Amick at 795', [case_citation(page='793', volume="455",