-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
8 changed files
with
206 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
120 changes: 120 additions & 0 deletions
120
backend/tests/unit/danswer/indexing/test_indexing_pipeline.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
from typing import List | ||
|
||
from danswer.configs.app_configs import MAX_DOCUMENT_CHARS | ||
from danswer.connectors.models import Document | ||
from danswer.connectors.models import DocumentSource | ||
from danswer.connectors.models import Section | ||
from danswer.indexing.indexing_pipeline import filter_documents | ||
|
||
|
||
def create_test_document(
    doc_id: str = "test_id",
    title: str | None = "Test Title",
    semantic_id: str = "test_semantic_id",
    sections: List[Section] | None = None,
) -> Document:
    """Build a ``Document`` fixture for the ``filter_documents`` tests.

    Args:
        doc_id: Value used for the document's ``id``.
        title: Value used for the document's ``title``; may be ``None``.
        semantic_id: Value used for ``semantic_identifier``.
        sections: Sections to attach. When ``None``, a single section with
            the text ``"Test content"`` and link ``"test_link"`` is used.

    Returns:
        A ``Document`` with source ``DocumentSource.FILE`` and empty metadata.
    """
    # Build the fallback section per call so no list is shared across calls.
    resolved_sections = (
        sections
        if sections is not None
        else [Section(text="Test content", link="test_link")]
    )
    return Document(
        id=doc_id,
        title=title,
        semantic_identifier=semantic_id,
        sections=resolved_sections,
        source=DocumentSource.FILE,
        metadata={},
    )
|
||
|
||
def test_filter_documents_empty_title_and_content() -> None:
    """A document whose title, semantic id and section text are all empty is dropped."""
    blank_section = Section(text="", link="test_link")
    blank_document = create_test_document(
        title="", semantic_id="", sections=[blank_section]
    )

    kept = filter_documents([blank_document])

    assert len(kept) == 0
|
||
|
||
def test_filter_documents_empty_title_with_content() -> None:
    """A document with an empty title but non-empty section text is kept."""
    content_section = Section(text="Valid content", link="test_link")
    untitled_document = create_test_document(title="", sections=[content_section])

    kept = filter_documents([untitled_document])

    assert len(kept) == 1
    # The helper's default doc_id is "test_id".
    assert kept[0].id == "test_id"
|
||
|
||
def test_filter_documents_empty_content_with_title() -> None:
    """A document with a title but empty section text is kept."""
    empty_section = Section(text="", link="test_link")
    titled_document = create_test_document(
        title="Valid Title", sections=[empty_section]
    )

    kept = filter_documents([titled_document])

    assert len(kept) == 1
    # The helper's default doc_id is "test_id".
    assert kept[0].id == "test_id"
|
||
|
||
def test_filter_documents_exceeding_max_chars() -> None:
    """A document one character over ``MAX_DOCUMENT_CHARS`` is dropped.

    When ``MAX_DOCUMENT_CHARS`` is falsy (no limit configured) the test
    returns early without asserting anything.
    """
    if not MAX_DOCUMENT_CHARS:  # Skip if no max chars configured
        return

    over_limit_length = MAX_DOCUMENT_CHARS + 1
    oversized_section = Section(text="a" * over_limit_length, link="test_link")
    oversized_document = create_test_document(sections=[oversized_section])

    kept = filter_documents([oversized_document])

    assert len(kept) == 0
|
||
|
||
def test_filter_documents_valid_document() -> None:
    """A document with both a title and section text is kept with its id and title."""
    content_section = Section(text="Valid content", link="test_link")
    valid_document = create_test_document(
        title="Valid Title", sections=[content_section]
    )

    kept = filter_documents([valid_document])

    assert len(kept) == 1
    survivor = kept[0]
    assert survivor.id == "test_id"
    assert survivor.title == "Valid Title"
|
||
|
||
def test_filter_documents_whitespace_only() -> None:
    """A document whose title, semantic id and section text are only whitespace is dropped."""
    whitespace_section = Section(text=" ", link="test_link")
    whitespace_document = create_test_document(
        title=" ", semantic_id=" ", sections=[whitespace_section]
    )

    kept = filter_documents([whitespace_document])

    assert len(kept) == 0
|
||
|
||
def test_filter_documents_semantic_id_no_title() -> None:
    """A document with ``title=None`` but a semantic id and section text is kept."""
    content_section = Section(text="Valid content", link="test_link")
    titleless_document = create_test_document(
        title=None,
        semantic_id="Valid Semantic ID",
        sections=[content_section],
    )

    kept = filter_documents([titleless_document])

    assert len(kept) == 1
    assert kept[0].semantic_identifier == "Valid Semantic ID"
|
||
|
||
def test_filter_documents_multiple_sections() -> None:
    """A document with three non-empty sections is kept with all three sections."""
    three_sections = [
        Section(text="Content 1", link="test_link"),
        Section(text="Content 2", link="test_link"),
        Section(text="Content 3", link="test_link"),
    ]
    multi_section_document = create_test_document(sections=three_sections)

    kept = filter_documents([multi_section_document])

    assert len(kept) == 1
    assert len(kept[0].sections) == 3
|
||
|
||
def test_filter_documents_multiple_documents() -> None:
    """In a batch of three, only the document with an empty title and empty text is dropped."""
    first_valid = create_test_document(doc_id="1", title="Title 1")
    # Should be filtered
    empty_document = create_test_document(
        doc_id="2", title="", sections=[Section(text="", link="test_link")]
    )
    second_valid = create_test_document(doc_id="3", title="Title 3")

    kept = filter_documents([first_valid, empty_document, second_valid])

    assert len(kept) == 2
    kept_ids = {document.id for document in kept}
    assert kept_ids == {"1", "3"}
|
||
|
||
def test_filter_documents_empty_batch() -> None:
    """Filtering an empty batch yields an empty result."""
    kept = filter_documents([])

    assert len(kept) == 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters