feat: Updates docs and types for language param in PreProcessor (#3186)

* Small update to language param docs in PreProcessor
deepset-ai · Sep 21, 2022 · 861a13a · 861a13a
1 parent e7d4dc3
commit 861a13a
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 5 deletions.
diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md
@@ -84,7 +84,8 @@ Set the value to 0 to ensure there is no overlap among the documents after split
 - `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. If set
 to True, the individual split will always have complete sentences &
 the number of words will be <= split_length.
-- `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
+- `language`: The language used by "nltk.tokenize.sent_tokenize", given as an ISO 639-1 code.
+Available options: "ru", "sl", "es", "sv", "tr", "cs", "da", "nl", "en", "et", "fi", "fr", "de", "el", "it", "no", "pl", "pt", "ml".
 - `tokenizer_model_folder`: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
 - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are

diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py
@@ -39,6 +39,7 @@
     "no": "norwegian",
     "pl": "polish",
     "pt": "portuguese",
+    "ml": "malayalam",
 }
 
 
@@ -79,7 +80,8 @@ def __init__(
         :param split_respect_sentence_boundary: Whether to split in partial sentences if split_by -> `word`. If set
                                                 to True, the individual split will always have complete sentences &
                                                 the number of words will be <= split_length.
-        :param language: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
+        :param language: The language used by "nltk.tokenize.sent_tokenize", given as an ISO 639-1 code.
+            Available options: "ru", "sl", "es", "sv", "tr", "cs", "da", "nl", "en", "et", "fi", "fr", "de", "el", "it", "no", "pl", "pt", "ml".
         :param tokenizer_model_folder: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
@@ -152,13 +154,10 @@ def process(
         if id_hash_keys is None:
             id_hash_keys = self.id_hash_keys
 
-        ret = []
-
         if isinstance(documents, (Document, dict)):
             ret = self._process_single(document=documents, id_hash_keys=id_hash_keys, **kwargs)  # type: ignore
         elif isinstance(documents, list):
             ret = self._process_batch(documents=list(documents), id_hash_keys=id_hash_keys, **kwargs)
-
         else:
             raise Exception("documents provided to PreProcessor.prepreprocess() is not of type list nor Document")