From 654ec6c991c07289d1daf9192cd38577657c3b05 Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Tue, 10 Oct 2023 14:44:37 +0200
Subject: [PATCH] feat: return NaN for syllable-based readability metrics
 when syllables can't be counted for the language

---
 .../components/readability.py                 | 25 +++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/textdescriptives/components/readability.py b/src/textdescriptives/components/readability.py
index ab5c5528..48724ca9 100644
--- a/src/textdescriptives/components/readability.py
+++ b/src/textdescriptives/components/readability.py
@@ -7,7 +7,10 @@
 from spacy.tokens import Doc
 from wasabi import msg
 
-from .descriptive_stats import create_descriptive_stats_component  # noqa
+from .descriptive_stats import (  # noqa
+    create_descriptive_stats_component,
+    language_exists_in_pyphen,
+)
 from .utils import filter_tokens
 
 
@@ -20,6 +23,8 @@ class Readability:
 
     def __init__(self, nlp: Language):
         """Initialise components."""
+        self.can_calculate_syllables = language_exists_in_pyphen(lang=nlp.lang)
+
         if not Doc.has_extension("readability"):
             Doc.set_extension("readability", getter=self.readability)
 
@@ -31,6 +36,9 @@ def _flesch_reading_ease(self, doc: Doc):
 
         Higher = easier to read
         """
+        if not self.can_calculate_syllables:
+            return np.nan
+
         avg_sentence_length = doc._.sentence_length["sentence_length_mean"]
         avg_syl_per_word = doc._.syllables["syllables_per_token_mean"]
         if avg_sentence_length == 0 or avg_syl_per_word == 0:
@@ -44,6 +52,9 @@ def _flesch_kincaid_grade(self, doc: Doc):
 
         0.39 * (avg sent len) + 11.8 * (avg_syl_per_word) - 15.59
         """
+        if not self.can_calculate_syllables:
+            return np.nan
+
         avg_sentence_length = doc._.sentence_length["sentence_length_mean"]
         avg_syl_per_word = doc._.syllables["syllables_per_token_mean"]
         if avg_sentence_length == 0 or avg_syl_per_word == 0:
@@ -60,6 +71,9 @@ def _smog(self, doc: Doc, n_hard_words: int):
         Where hard words are words with 3 or more syllables.
         Preferably need 30+ sentences. Will not work with less than 4
         """
+        if not self.can_calculate_syllables:
+            return np.nan
+
         n_sentences = doc._._n_sentences
         if n_sentences >= 3:
             smog = (1.043 * (30 * (n_hard_words / n_sentences)) ** 0.5) + 3.1291
@@ -74,6 +88,9 @@ def _gunning_fog(self, doc, n_hard_words: int):
 
         Where hard words are word with 3 or more syllables.
         """
+        if not self.can_calculate_syllables:
+            return np.nan
+
         n_tokens = doc._._n_tokens
         if n_tokens == 0:
             return np.nan
@@ -139,7 +156,11 @@ def _rix(self, doc: Doc, long_words: int):
 
     def readability(self, doc: Doc) -> Dict[str, float]:
         """Apply readability functions and return a dict of the results."""
-        hard_words = len([syllable for syllable in doc._._n_syllables if syllable >= 3])
+        hard_words = (
+            len([syllable for syllable in doc._._n_syllables if syllable >= 3])
+            if self.can_calculate_syllables
+            else 0
+        )
         long_words = len([t for t in filter_tokens(doc) if len(t) > 6])
 
         return {