diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py
index d684871456..c517c7075c 100644
--- a/haystack/components/preprocessors/recursive_splitter.py
+++ b/haystack/components/preprocessors/recursive_splitter.py
@@ -221,13 +221,14 @@ def _chunk_length(self, text: str) -> int:
         The length of the chunk in words or characters.
         """
         if self.split_units == "word":
-            # regex that matches a page break followed by one or multiple whitespaces
-            if re.match(r"\f\s*", text):
-                return 1
-            return len(text.split(" "))
+            # split on whitespace and count the non-empty tokens; newlines and form
+            # feeds are counted on top, so page breaks still contribute to the length
+            # (note: "\x0c" is the same character as "\f", so counting both would double-count)
+            words = [word for word in text.split() if word]
+            return len(words) + text.count("\n") + text.count("\f")
         else:
             return len(text)
 
     def _chunk_text(self, text: str) -> List[str]:
         """
         Recursive chunking algorithm that divides text into smaller chunks based on a list of separator characters.
@@ -247,23 +248,22 @@ def _chunk_text(self, text: str) -> List[str]:
 
         for curr_separator in self.separators:  # type: ignore # the caller already checked that separators is not None
             if curr_separator == "sentence":
-                # correct SentenceSplitter initialization is checked at the initialization of the component
+                # the "type: ignore" below is safe: correct SentenceSplitter initialization is checked when the component is created
                 sentence_with_spans = self.nltk_tokenizer.split_sentences(text)  # type: ignore
                 splits = [sentence["sentence"] for sentence in sentence_with_spans]
             else:
+                # escape the separator and wrap it in a capture group so that it is included in the splits as well
                 escaped_separator = re.escape(curr_separator)
-                escaped_separator = (
-                    f"({escaped_separator})"  # wrap the separator in a group to include it in the splits
-                )
-                splits = re.split(escaped_separator, text)
+                escaped_separator = f"({escaped_separator})"
 
-                # merge every two consecutive splits, i.e.: the text and the separator after it
+                # split the text and merge every two consecutive splits, i.e. the text and the separator after it
+                splits = re.split(escaped_separator, text)
                 splits = [
                     "".join([splits[i], splits[i + 1]]) if i < len(splits) - 1 else splits[i]
                     for i in range(0, len(splits), 2)
                 ]
 
-                # remove last split if it is empty
+                # remove the last split if it's empty
                 splits = splits[:-1] if splits[-1] == "" else splits
 
             if len(splits) == 1:  # go to next separator, if current separator not found in the text
diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py
index 40181bd2c2..d3c86f417d 100644
--- a/test/components/preprocessors/test_recursive_splitter.py
+++ b/test/components/preprocessors/test_recursive_splitter.py
@@ -519,7 +519,7 @@ def test_run_split_by_word_count_page_breaks_word_unit():
     doc_chunks = splitter.run([doc])
    doc_chunks = doc_chunks["documents"]
 
-    assert len(doc_chunks) == 9
+    assert len(doc_chunks) == 5
    assert doc_chunks[0].content == "This is some text. "
    assert doc_chunks[0].meta["page_number"] == 1
    assert doc_chunks[0].meta["split_id"] == 0
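
For reference, not part of the patch: a minimal standalone sketch of the counting rule the `_chunk_length` hunk encodes, assuming whitespace tokens plus one extra unit per newline or form feed. The helper name `chunk_length_words` is hypothetical.

```python
def chunk_length_words(text: str) -> int:
    # count non-empty whitespace-separated tokens, then add one unit per
    # newline or form feed so page breaks still contribute to chunk length
    # ("\x0c" and "\f" are the same character, so form feeds are counted once)
    words = [word for word in text.split() if word]
    return len(words) + text.count("\n") + text.count("\f")

assert chunk_length_words("This is some text.") == 4
assert chunk_length_words("end of page\f") == 4       # 3 words + 1 form feed
assert chunk_length_words("line one\nline two") == 5  # 4 words + 1 newline
```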
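Also illustrative only: the capture-group trick used in `_chunk_text`, shown in isolation. Wrapping the escaped separator in a group makes `re.split` return the separators as their own list elements, which the pairwise merge then glues back onto the preceding text. The name `split_keeping_separator` is hypothetical.

```python
import re

def split_keeping_separator(text: str, separator: str) -> list:
    # the capture group makes re.split emit the separator as its own element
    pattern = f"({re.escape(separator)})"
    splits = re.split(pattern, text)
    # merge every (text, separator) pair back together
    merged = [
        "".join([splits[i], splits[i + 1]]) if i < len(splits) - 1 else splits[i]
        for i in range(0, len(splits), 2)
    ]
    # drop the trailing empty element left when the text ends with the separator
    return merged[:-1] if merged[-1] == "" else merged

print(split_keeping_separator("one. two. three.", ". "))
# ['one. ', 'two. ', 'three.']
```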