diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md
index 4a10d0a5ac..fe62a1b815 100644
--- a/docs/_src/api/api/preprocessor.md
+++ b/docs/_src/api/api/preprocessor.md
@@ -84,7 +84,8 @@ Set the value to 0 to ensure there is no overlap among the documents after split
 - `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. If set
 to True, the individual split will always have complete sentences & the number of words will be <= split_length.
-- `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
+- `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format.
+Available options: "ru","sl","es","sv","tr","cs","da","nl","en","et","fi","fr","de","el","it","no","pl","pt","ml"
 - `tokenizer_model_folder`: Path to the folder containing the NTLK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
 - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py
index 37f9a90cd0..b5b09f50c1 100644
--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@@ -39,6 +39,7 @@
     "no": "norwegian",
     "pl": "polish",
     "pt": "portuguese",
+    "ml": "malayalam",
 }


@@ -79,7 +80,8 @@ def __init__(
         :param split_respect_sentence_boundary: Whether to split in partial sentences if split_by -> `word`. If set
             to True, the individual split will always have complete sentences & the number of words will be <= split_length.
-        :param language: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
+        :param language: The language used by "nltk.tokenize.sent_tokenize" in iso639 format.
+            Available options: "ru","sl","es","sv","tr","cs","da","nl","en","et","fi","fr","de","el","it","no","pl","pt","ml"
         :param tokenizer_model_folder: Path to the folder containing the NTLK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
@@ -152,13 +154,10 @@ def process(
         if id_hash_keys is None:
             id_hash_keys = self.id_hash_keys

-        ret = []
-
         if isinstance(documents, (Document, dict)):
             ret = self._process_single(document=documents, id_hash_keys=id_hash_keys, **kwargs)  # type: ignore
         elif isinstance(documents, list):
             ret = self._process_batch(documents=list(documents), id_hash_keys=id_hash_keys, **kwargs)
-
         else:
             raise Exception("documents provided to PreProcessor.prepreprocess() is not of type list nor Document")
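
For reviewers, here is a minimal usage sketch (not part of the diff) showing how the new `"ml"` option is exercised end to end. It assumes the Haystack v1 `PreProcessor` API touched above and that NLTK's Punkt sentence models, which include a Malayalam model, are available locally; the sample content is a placeholder.

```python
# Minimal sketch, not part of this diff: exercising the new "ml" language code.
# Assumes haystack v1.x and locally available NLTK Punkt models (the Punkt
# data distributed with NLTK includes a Malayalam model).
from haystack import Document
from haystack.nodes import PreProcessor

preprocessor = PreProcessor(
    split_by="word",
    split_length=100,
    split_overlap=0,
    split_respect_sentence_boundary=True,  # sentence splitting triggers the NLTK tokenizer
    language="ml",  # iso639 code, mapped via iso639_to_nltk to NLTK's "malayalam" model
)

# Placeholder content; any Malayalam text would be split on sentence boundaries here.
docs = preprocessor.process([Document(content="<Malayalam text>")])
print(len(docs), "documents after splitting")
```

Before this change, `"ml"` had no entry in `iso639_to_nltk`, so the PreProcessor could not resolve NLTK's existing Malayalam Punkt model. The `ret = []` removal in `process()` is behavior-preserving cleanup: every branch of the `if`/`elif`/`else` either assigns `ret` or raises, so the initialization was dead code.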