Skip to content

Commit

Permalink
Fixed failure on empty string
Browse files Browse the repository at this point in the history
  • Loading branch information
KennethEnevoldsen committed Apr 19, 2023
1 parent 3f747f2 commit cb55e23
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 8 deletions.
7 changes: 4 additions & 3 deletions src/textdescriptives/components/coherence.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ def n_order_coherence(doc: Doc, order: int) -> List[float]:
Returns:
A list of floats representing the semantic similarity between sentences
"""
+    sents = list(doc.sents)
+    if len(sents) < order + 1:
+        return [np.nan]
+
if doc.vector.size == 0:
raise ValueError(
"Sentence vectors are not available. Thus it is not possible to "
Expand All @@ -32,9 +36,6 @@ def n_order_coherence(doc: Doc, order: int) -> List[float]:
+ "sentencizer and word vectors to the pipeline?",
)

-    sents = list(doc.sents)
-    if len(sents) < order + 1:
-        return [np.nan]
similarities: List[float] = []
for i, sent in enumerate(sents):
if i == len(sents) - order:
Expand Down
7 changes: 5 additions & 2 deletions src/textdescriptives/components/quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,9 +346,12 @@ def oov_ratio(span: Union[Span, Doc], vocab: Optional[Mapping] = None) -> float:
Returns:
float: the out-of-vocabulary ratio
"""
+    len_span = len(span)
+    if len_span == 0:
+        return 0.0
     if vocab is None:
-        return len([token for token in span if token.is_oov]) / len(span)
-    return len([token for token in span if token.text not in vocab]) / len(span)
+        return len([token for token in span if token.is_oov]) / len_span
+    return len([token for token in span if token.text not in vocab]) / len_span


class Quality:
Expand Down
8 changes: 5 additions & 3 deletions tests/test_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,10 @@ def test_extract_metrics_multiple_metrics():
assert "dependency_distance_mean" in df.columns


-def test_extract_metrics_all_metrics():
-    text = "just a little test"
-
+@pytest.mark.parametrize(
+    "text",
+    ["just a little test", ""],
+)
+def test_extract_metrics_all_metrics(text: str):
df = td.extract_metrics(text=text, spacy_model="en_core_web_sm", metrics=None)
assert "n_tokens" in df.columns
1 change: 1 addition & 0 deletions tests/test_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ def test_quality_component_with_config(nlp: spacy.Language):
),
("This test has many symbols #!@#$%^&*()_+.", False),
("- this is a text of \n - bullet points", False),
("", False), # test that it handles empty strings
],
)
def test_passed_quality_check(text: str, passed: bool, nlp: spacy.Language):
Expand Down

0 comments on commit cb55e23

Please sign in to comment.