fix: Move potential nltk download to warm_up (#8646)
* Move potential nltk download to warm_up

* Update tests

* Add release notes

* Fix tests

* Uncomment

* Make mypy happy

* Add RuntimeError message

* Update release notes

---------

Co-authored-by: Julian Risch <[email protected]>
sjrl and julian-risch authored Dec 20, 2024
1 parent f4d9c2b commit 286061f
Showing 5 changed files with 90 additions and 17 deletions.
27 changes: 19 additions & 8 deletions haystack/components/preprocessors/document_splitter.py
@@ -107,15 +107,10 @@ def __init__( # pylint: disable=too-many-positional-arguments
splitting_function=splitting_function,
respect_sentence_boundary=respect_sentence_boundary,
)

if split_by == "sentence" or (respect_sentence_boundary and split_by == "word"):
self._use_sentence_splitter = split_by == "sentence" or (respect_sentence_boundary and split_by == "word")
if self._use_sentence_splitter:
nltk_imports.check()
self.sentence_splitter = SentenceSplitter(
language=language,
use_split_rules=use_split_rules,
extend_abbreviations=extend_abbreviations,
keep_white_spaces=True,
)
self.sentence_splitter = None

if split_by == "sentence":
# ToDo: remove this warning in the next major release
@@ -164,6 +159,18 @@ def _init_checks(
)
self.respect_sentence_boundary = False

def warm_up(self):
"""
Warm up the DocumentSplitter by loading the sentence tokenizer.
"""
if self._use_sentence_splitter and self.sentence_splitter is None:
self.sentence_splitter = SentenceSplitter(
language=self.language,
use_split_rules=self.use_split_rules,
extend_abbreviations=self.extend_abbreviations,
keep_white_spaces=True,
)

@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
"""
@@ -182,6 +189,10 @@ def run(self, documents: List[Document]):
:raises TypeError: if the input is not a list of Documents.
:raises ValueError: if the content of a document is None.
"""
if self._use_sentence_splitter and self.sentence_splitter is None:
raise RuntimeError(
"The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
)

if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
raise TypeError("DocumentSplitter expects a List of Documents as input.")
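
Net effect of the changes in this file: constructing a DocumentSplitter no longer touches NLTK; the SentenceSplitter is built lazily in warm_up(), and run() now guards against a missing warm-up. A minimal sketch of the resulting contract (a hypothetical standalone script, not part of this commit; the error message is quoted from the diff above):

    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    splitter = DocumentSplitter(split_by="sentence", split_length=2)  # no NLTK download at init
    try:
        splitter.run(documents=[Document(content="One sentence. Another one.")])
    except RuntimeError as err:
        # "The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
        print(err)
    splitter.warm_up()  # builds the SentenceSplitter; may download NLTK data on first use
    result = splitter.run(documents=[Document(content="One sentence. Another one.")])
    print(len(result["documents"]))  # 1 chunk, since split_length=2 sentences per chunk
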
32 changes: 23 additions & 9 deletions haystack/components/preprocessors/nltk_document_splitter.py
@@ -77,16 +77,23 @@ def __init__( # pylint: disable=too-many-positional-arguments
self.respect_sentence_boundary = respect_sentence_boundary
self.use_split_rules = use_split_rules
self.extend_abbreviations = extend_abbreviations
self.sentence_splitter = SentenceSplitter(
language=language,
use_split_rules=use_split_rules,
extend_abbreviations=extend_abbreviations,
keep_white_spaces=True,
)
self.sentence_splitter = None
self.language = language

def warm_up(self):
"""
Warm up the NLTKDocumentSplitter by loading the sentence tokenizer.
"""
if self.sentence_splitter is None:
self.sentence_splitter = SentenceSplitter(
language=self.language,
use_split_rules=self.use_split_rules,
extend_abbreviations=self.extend_abbreviations,
keep_white_spaces=True,
)

def _split_into_units(
self, text: str, split_by: Literal["function", "page", "passage", "sentence", "word", "line"]
self, text: str, split_by: Literal["function", "page", "passage", "period", "sentence", "word", "line"]
) -> List[str]:
"""
Splits the text into units based on the specified split_by parameter.
@@ -106,6 +113,7 @@ def _split_into_units(
# whitespace is preserved while splitting text into sentences when using keep_white_spaces=True
# so split_at is set to an empty string
self.split_at = ""
assert self.sentence_splitter is not None
result = self.sentence_splitter.split_sentences(text)
units = [sentence["sentence"] for sentence in result]
elif split_by == "word":
@@ -142,6 +150,11 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
:raises TypeError: if the input is not a list of Documents.
:raises ValueError: if the content of a document is None.
"""
if self.sentence_splitter is None:
raise RuntimeError(
"The component NLTKDocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
)

if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
raise TypeError("DocumentSplitter expects a List of Documents as input.")

@@ -221,8 +234,9 @@ def _number_of_sentences_to_keep(sentences: List[str], split_length: int, split_
break
return num_sentences_to_keep

@staticmethod
def _concatenate_sentences_based_on_word_amount(
self, sentences: List[str], split_length: int, split_overlap: int
sentences: List[str], split_length: int, split_overlap: int
) -> Tuple[List[str], List[int], List[int]]:
"""
Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
@@ -258,7 +272,7 @@ def _concatenate_sentences_based_on_word_amount(
split_start_indices.append(chunk_start_idx)

# Get the number of sentences that overlap with the next chunk
num_sentences_to_keep = self._number_of_sentences_to_keep(
num_sentences_to_keep = NLTKDocumentSplitter._number_of_sentences_to_keep(
sentences=current_chunk, split_length=split_length, split_overlap=split_overlap
)
# Set up information for the new chunk
@@ -0,0 +1,4 @@
---
fixes:
- |
    Moved the NLTK download in DocumentSplitter and NLTKDocumentSplitter to warm_up(). This prevents calls to an external API during instantiation. If a DocumentSplitter or NLTKDocumentSplitter is used for sentence splitting outside of a pipeline, warm_up() now needs to be called before running the component.
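
A minimal usage sketch of the two call patterns implied by this note (assuming, as in Haystack 2.x, that Pipeline.run() warms up its components automatically; the pipeline wiring below is illustrative, not part of this commit):

    from haystack import Document, Pipeline
    from haystack.components.preprocessors import DocumentSplitter

    # Inside a pipeline: no change needed, the pipeline warms up the splitter.
    pipeline = Pipeline()
    pipeline.add_component("splitter", DocumentSplitter(split_by="sentence"))
    pipeline.run({"splitter": {"documents": [Document(content="First. Second.")]}})

    # Standalone sentence splitting: warm_up() must now be called explicitly.
    splitter = DocumentSplitter(split_by="sentence")
    splitter.warm_up()
    splitter.run(documents=[Document(content="First. Second.")])
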
34 changes: 34 additions & 0 deletions test/components/preprocessors/test_document_splitter.py
@@ -44,16 +44,19 @@ def test_non_text_document(self):
ValueError, match="DocumentSplitter only works with text documents but content for document ID"
):
splitter = DocumentSplitter()
splitter.warm_up()
splitter.run(documents=[Document()])
assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text

def test_single_doc(self):
with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
splitter = DocumentSplitter()
splitter.warm_up()
splitter.run(documents=Document())

def test_empty_list(self):
splitter = DocumentSplitter()
splitter.warm_up()
res = splitter.run(documents=[])
assert res == {"documents": []}

@@ -76,6 +79,7 @@ def test_unsupported_split_overlap(self):
def test_split_by_word(self):
splitter = DocumentSplitter(split_by="word", split_length=10)
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]
assert len(docs) == 2
@@ -88,6 +92,7 @@ def test_split_by_word(self):

def test_split_by_word_with_threshold(self):
splitter = DocumentSplitter(split_by="word", split_length=15, split_threshold=10)
splitter.warm_up()
result = splitter.run(
documents=[
Document(
@@ -105,6 +110,7 @@ def test_split_by_word_multiple_input_docs(self):
splitter = DocumentSplitter(split_by="word", split_length=10)
text1 = "This is a text with some words. There is a second sentence. And there is a third sentence."
text2 = "This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
splitter.warm_up()
result = splitter.run(documents=[Document(content=text1), Document(content=text2)])
docs = result["documents"]
assert len(docs) == 5
@@ -132,6 +138,7 @@ def test_split_by_word_multiple_input_docs(self):
def test_split_by_period(self):
splitter = DocumentSplitter(split_by="period", split_length=1)
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]
assert len(docs) == 3
@@ -148,6 +155,7 @@ def test_split_by_period(self):
def test_split_by_passage(self):
splitter = DocumentSplitter(split_by="passage", split_length=1)
text = "This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]
assert len(docs) == 3
@@ -164,6 +172,7 @@ def test_split_by_passage(self):
def test_split_by_page(self):
splitter = DocumentSplitter(split_by="page", split_length=1)
text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]
assert len(docs) == 3
@@ -183,6 +192,7 @@ def test_split_by_page(self):
def test_split_by_function(self):
splitting_function = lambda s: s.split(".")
splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
splitter.warm_up()
text = "This.Is.A.Test"
result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
docs = result["documents"]
@@ -200,6 +210,7 @@ def test_split_by_function(self):
splitting_function = lambda s: re.split(r"[\s]{2,}", s)
splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
text = "This Is\n A Test"
splitter.warm_up()
result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
docs = result["documents"]
assert len(docs) == 4
@@ -215,6 +226,7 @@ def test_split_by_function(self):
def test_split_by_word_with_overlap(self):
splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]
assert len(docs) == 2
@@ -234,6 +246,7 @@ def test_split_by_word_with_overlap(self):
def test_split_by_line(self):
splitter = DocumentSplitter(split_by="line", split_length=1)
text = "This is a text with some words.\nThere is a second sentence.\nAnd there is a third sentence."
splitter.warm_up()
result = splitter.run(documents=[Document(content=text)])
docs = result["documents"]

@@ -252,6 +265,7 @@ def test_source_id_stored_in_metadata(self):
splitter = DocumentSplitter(split_by="word", split_length=10)
doc1 = Document(content="This is a text with some words.")
doc2 = Document(content="This is a different text with some words.")
splitter.warm_up()
result = splitter.run(documents=[doc1, doc2])
assert result["documents"][0].meta["source_id"] == doc1.id
assert result["documents"][1].meta["source_id"] == doc2.id
@@ -262,6 +276,7 @@ def test_copy_metadata(self):
Document(content="Text.", meta={"name": "doc 0"}),
Document(content="Text.", meta={"name": "doc 1"}),
]
splitter.warm_up()
result = splitter.run(documents=documents)
assert len(result["documents"]) == 2
assert result["documents"][0].id != result["documents"][1].id
@@ -273,6 +288,7 @@ def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
splitter = DocumentSplitter(split_by="word", split_length=2)
doc1 = Document(content="This is some text.\f This text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
splitter.warm_up()
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
@@ -283,6 +299,7 @@ def test_add_page_number_to_metadata_with_no_overlap_period_split(self):
splitter = DocumentSplitter(split_by="period", split_length=1)
doc1 = Document(content="This is some text.\f This text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
splitter.warm_up()
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 1, 1]
@@ -294,6 +311,7 @@ def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
doc1 = Document(
content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
)
splitter.warm_up()
result = splitter.run(documents=[doc1])

expected_pages = [1, 2, 2, 2]
@@ -305,6 +323,7 @@ def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
splitter.warm_up()
result = splitter.run(documents=[doc1])
expected_pages = [1, 2, 3]
for doc, p in zip(result["documents"], expected_pages):
@@ -314,6 +333,7 @@ def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
splitter.warm_up()
result = splitter.run(documents=[doc1])
expected_pages = [1, 3]

@@ -324,6 +344,7 @@ def test_add_page_number_to_metadata_with_overlap_word_split(self):
splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
doc1 = Document(content="This is some text. And\f this text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
splitter.warm_up()
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
@@ -334,6 +355,7 @@ def test_add_page_number_to_metadata_with_overlap_period_split(self):
splitter = DocumentSplitter(split_by="period", split_length=2, split_overlap=1)
doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
doc2 = Document(content="This content has two.\f\f page brakes. More text.")
splitter.warm_up()
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 1, 2, 1, 1]
@@ -345,6 +367,7 @@ def test_add_page_number_to_metadata_with_overlap_passage_split(self):
doc1 = Document(
content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
)
splitter.warm_up()
result = splitter.run(documents=[doc1])

expected_pages = [1, 2, 2]
@@ -356,6 +379,7 @@ def test_add_page_number_to_metadata_with_overlap_page_split(self):
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
splitter.warm_up()
result = splitter.run(documents=[doc1])
expected_pages = [1, 2, 3]

@@ -366,6 +390,7 @@ def test_add_split_overlap_information(self):
splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
text = "This is a text with some words. There is a second sentence. And a third sentence."
doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
splitter.warm_up()
docs = splitter.run(documents=[doc])["documents"]

# check split_overlap is added to all the documents
@@ -487,6 +512,7 @@ def test_run_empty_document(self):
"""
splitter = DocumentSplitter()
doc = Document(content="")
splitter.warm_up()
results = splitter.run([doc])
assert results["documents"] == []

@@ -496,6 +522,7 @@ def test_run_document_only_whitespaces(self):
"""
splitter = DocumentSplitter()
doc = Document(content=" ")
splitter.warm_up()
results = splitter.run([doc])
assert results["documents"][0].content == " "

@@ -543,6 +570,7 @@ def test_run_split_by_sentence_1(self) -> None:
"Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "
"The moon was full."
)
document_splitter.warm_up()
documents = document_splitter.run(documents=[Document(content=text)])["documents"]

assert len(documents) == 2
@@ -568,6 +596,7 @@ def test_run_split_by_sentence_2(self) -> None:
"This is another test sentence. (This is a third test sentence.) "
"This is the last test sentence."
)
document_splitter.warm_up()
documents = document_splitter.run(documents=[Document(content=text)])["documents"]

assert len(documents) == 4
@@ -601,6 +630,7 @@ def test_run_split_by_sentence_3(self) -> None:
use_split_rules=True,
extend_abbreviations=True,
)
document_splitter.warm_up()

text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
documents = document_splitter.run(documents=[Document(content=text)])["documents"]
@@ -633,6 +663,7 @@ def test_run_split_by_sentence_4(self) -> None:
use_split_rules=True,
extend_abbreviations=True,
)
document_splitter.warm_up()

text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
documents = document_splitter.run(documents=[Document(content=text)])["documents"]
@@ -660,6 +691,7 @@ def test_run_split_by_word_respect_sentence_boundary(self) -> None:
language="en",
respect_sentence_boundary=True,
)
document_splitter.warm_up()

text = (
"Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"
@@ -692,6 +724,7 @@ def test_run_split_by_word_respect_sentence_boundary_no_repeats(self) -> None:
use_split_rules=False,
extend_abbreviations=False,
)
document_splitter.warm_up()
text = (
"This is a test sentence with many many words that exceeds the split length and should not be repeated. "
"This is another test sentence. (This is a third test sentence.) "
@@ -717,6 +750,7 @@ def test_run_split_by_word_respect_sentence_boundary_with_split_overlap_and_page
extend_abbreviations=True,
respect_sentence_boundary=True,
)
document_splitter.warm_up()

text = (
"Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"