Skip to content

Commit

Permalink
Update LinkContentFetcher, streams will be routed by FileTypeRouter
Browse files Browse the repository at this point in the history
  • Loading branch information
vblagoje committed Oct 8, 2023
1 parent 859308e commit 65ef679
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 26 deletions.
12 changes: 5 additions & 7 deletions haystack/preview/components/fetchers/link_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,15 +119,13 @@ def from_dict(cls, data: Dict[str, Any]) -> "LinkContentFetcher":
@component.output_types(streams=List[ByteStream])
def run(self, urls: List[str]):
"""
Fetches content from a list of URLs and returns a dictionary of extracted content streams.
For each content type there will be one outgoing edge created with value of List[ByteStream].
Fetches content from a list of URLs and returns a list of extracted content streams.
Each content stream is a ByteStream object containing the extracted content as binary data.
The content type of each stream is stored in the metadata of the ByteStream object under
the key "content_type".
:param urls: A list of URLs to fetch content from.
:return: A dictionary containing content streams categorized by content type.
The keys are content types (e.g., "text/html", "text/plain", "application/pdf"),
and the values are lists of ByteStream objects representing the extracted content.
:return: A list of ByteStream objects representing the extracted content.
"""
streams = []
if not urls:
Expand Down
37 changes: 18 additions & 19 deletions test/preview/components/fetchers/test_link_content_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_run_text(self):
)
fetcher = LinkContentFetcher()
streams = fetcher.run(urls=["https://www.example.com"])["streams"]
assert streams["text/plain"][0].data == correct_response
assert streams[0].data == correct_response

@pytest.mark.unit
def test_run_html(self):
Expand All @@ -116,7 +116,7 @@ def test_run_html(self):
)
fetcher = LinkContentFetcher()
streams = fetcher.run(urls=["https://www.example.com"])["streams"]
assert streams["text/html"][0].data == correct_response
assert streams[0].data == correct_response

@pytest.mark.unit
def test_run_binary(self, test_files_path):
Expand All @@ -127,7 +127,7 @@ def test_run_binary(self, test_files_path):
)
fetcher = LinkContentFetcher()
streams = fetcher.run(urls=["https://www.example.com"])["streams"]
assert streams["application/pdf"][0].data == file_bytes
assert streams[0].data == file_bytes

@pytest.mark.unit
def test_run_bad_status_code(self):
Expand All @@ -140,25 +140,25 @@ def test_run_bad_status_code(self):

# empty byte stream is returned because raise_on_failure is False
assert len(streams) == 1
assert streams["text/html"][0].data == empty_byte_stream
assert streams[0].data == empty_byte_stream

@pytest.mark.integration
def test_link_content_fetcher_html(self):
fetcher = LinkContentFetcher()
streams = fetcher.run([HTML_URL])["streams"]
assert "Haystack" in streams["text/html"][0].data.decode("utf-8")
assert "Haystack" in streams[0].data.decode("utf-8")

@pytest.mark.integration
def test_link_content_fetcher_text(self):
fetcher = LinkContentFetcher()
streams = fetcher.run([TEXT_URL])["streams"]
assert "Haystack" in streams["text/plain"][0].data.decode("utf-8")
assert "Haystack" in streams[0].data.decode("utf-8")

@pytest.mark.integration
def test_link_content_fetcher_pdf(self):
fetcher = LinkContentFetcher()
streams = fetcher.run([PDF_URL])["streams"]
assert len(streams["application/pdf"]) == 1 or len(streams["application/octet-stream"]) == 1
assert len(streams) == 1

@pytest.mark.integration
def test_link_content_fetcher_multiple_different_content_types(self):
Expand All @@ -168,8 +168,11 @@ def test_link_content_fetcher_multiple_different_content_types(self):
fetcher = LinkContentFetcher()
streams = fetcher.run([PDF_URL, HTML_URL])["streams"]
assert len(streams) == 2
assert len(streams["application/pdf"]) == 1 or len(streams["application/octet-stream"]) == 1
assert "Haystack" in streams["text/html"][0].data.decode("utf-8")
for stream in streams:
if stream.metadata["content_type"] == "text/html":
assert "Haystack" in stream.data.decode("utf-8")
elif stream.metadata["content_type"] == "application/pdf":
assert len(stream.data) > 0

@pytest.mark.integration
def test_link_content_fetcher_multiple_different_content_types_v2(self):
Expand All @@ -180,13 +183,9 @@ def test_link_content_fetcher_multiple_different_content_types_v2(self):

fetcher = LinkContentFetcher()
streams = fetcher.run([PDF_URL, HTML_URL, "https://google.com"])["streams"]
assert len(streams) == 2
assert len(streams["text/html"]) == 2
assert len(streams["application/pdf"]) == 1 or len(streams["application/octet-stream"]) == 1
assert "Haystack" in streams["text/html"][0].data.decode("utf-8") or "Haystack" in streams["text/html"][
1
].data.decode("utf-8")

assert "Search" in streams["text/html"][1].data.decode("utf-8") or "Search" in streams["text/html"][
0
].data.decode("utf-8")
assert len(streams) == 3
for stream in streams:
if stream.metadata["content_type"] == "text/html":
assert "Haystack" in stream.data.decode("utf-8") or "Google" in stream.data.decode("utf-8")
elif stream.metadata["content_type"] == "application/pdf":
assert len(stream.data) > 0

0 comments on commit 65ef679

Please sign in to comment.