Fixed failure on empty string

HLasse · Apr 19, 2023 · cb55e23 · cb55e23
1 parent 3f747f2
commit cb55e23
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 8 deletions.
diff --git a/src/textdescriptives/components/coherence.py b/src/textdescriptives/components/coherence.py
@@ -18,6 +18,10 @@ def n_order_coherence(doc: Doc, order: int) -> List[float]:
     Returns:
         A list of floats representing the semantic similarity between sentences
     """
+    sents = list(doc.sents)
+    if len(sents) < order + 1:
+        return [np.nan]
+
     if doc.vector.size == 0:
         raise ValueError(
             "Sentence vectors are not available. Thus it is not possible to "
@@ -32,9 +36,6 @@ def n_order_coherence(doc: Doc, order: int) -> List[float]:
             + "sentencizer and word vectors to the pipeline?",
         )
 
-    sents = list(doc.sents)
-    if len(sents) < order + 1:
-        return [np.nan]
     similarities: List[float] = []
     for i, sent in enumerate(sents):
         if i == len(sents) - order:

diff --git a/src/textdescriptives/components/quality.py b/src/textdescriptives/components/quality.py
@@ -346,9 +346,12 @@ def oov_ratio(span: Union[Span, Doc], vocab: Optional[Mapping] = None) -> float:
     Returns:
         float: the out-of-vocabulary ratio
     """
+    len_span = len(span)
+    if len_span == 0:
+        return 0.0
     if vocab is None:
-        return len([token for token in span if token.is_oov]) / len(span)
-    return len([token for token in span if token.text not in vocab]) / len(span)
+        return len([token for token in span if token.is_oov]) / len_span
+    return len([token for token in span if token.text not in vocab]) / len_span
 
 
 class Quality:

diff --git a/tests/test_extractors.py b/tests/test_extractors.py
@@ -186,8 +186,10 @@ def test_extract_metrics_multiple_metrics():
     assert "dependency_distance_mean" in df.columns
 
 
-def test_extract_metrics_all_metrics():
-    text = "just a little test"
-
+@pytest.mark.parametrize(
+    "text",
+    ["just a little test", ""],
+)
+def test_extract_metrics_all_metrics(text: str):
     df = td.extract_metrics(text=text, spacy_model="en_core_web_sm", metrics=None)
     assert "n_tokens" in df.columns
diff --git a/tests/test_quality.py b/tests/test_quality.py
@@ -252,6 +252,7 @@ def test_quality_component_with_config(nlp: spacy.Language):
         ),
         ("This test has many symbols #!@#$%^&*()_+.", False),
         ("- this is a text of \n - bullet points", False),
+        ("", False),  # test that it handles empty strings
     ],
 )
 def test_passed_quality_check(text: str, passed: bool, nlp: spacy.Language):