From 65ef6794504a420b373b656585fe58772b5080ec Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic
Date: Sun, 8 Oct 2023 12:06:54 +0200
Subject: [PATCH] Update LinkContentFetcher, streams will be routed by FileTypeRouter

---
 .../components/fetchers/link_content.py      | 12 +++---
 .../fetchers/test_link_content_fetcher.py    | 37 +++++++++----------
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/haystack/preview/components/fetchers/link_content.py b/haystack/preview/components/fetchers/link_content.py
index c8918e1c5a..caaf6a101a 100644
--- a/haystack/preview/components/fetchers/link_content.py
+++ b/haystack/preview/components/fetchers/link_content.py
@@ -119,15 +119,13 @@ def from_dict(cls, data: Dict[str, Any]) -> "LinkContentFetcher":
     @component.output_types(streams=List[ByteStream])
     def run(self, urls: List[str]):
         """
-        Fetches content from a list of URLs and returns a dictionary of extracted content streams.
-        For each content type there will be one outgoing edge created with value of List[ByteStream].
+        Fetches content from a list of URLs and returns a list of extracted content streams.
+        Each content stream is a ByteStream object containing the extracted content as binary data.
+        The content type of each stream is stored in the metadata of the ByteStream object under
+        the key "content_type".
 
         :param urls: A list of URLs to fetch content from.
-
-
-        :return: A dictionary containing content streams categorized by content type.
-            The keys are content types (e.g., "text/html", "text/plain", "application/pdf"),
-            and the values are lists of ByteStream objects representing the extracted content.
+        :return: A list of ByteStream objects representing the extracted content.
         """
         streams = []
         if not urls:
diff --git a/test/preview/components/fetchers/test_link_content_fetcher.py b/test/preview/components/fetchers/test_link_content_fetcher.py
index a0220bbd18..886f8fdc02 100644
--- a/test/preview/components/fetchers/test_link_content_fetcher.py
+++ b/test/preview/components/fetchers/test_link_content_fetcher.py
@@ -105,7 +105,7 @@ def test_run_text(self):
         )
         fetcher = LinkContentFetcher()
         streams = fetcher.run(urls=["https://www.example.com"])["streams"]
-        assert streams["text/plain"][0].data == correct_response
+        assert streams[0].data == correct_response
 
     @pytest.mark.unit
     def test_run_html(self):
@@ -116,7 +116,7 @@ def test_run_html(self):
         )
         fetcher = LinkContentFetcher()
         streams = fetcher.run(urls=["https://www.example.com"])["streams"]
-        assert streams["text/html"][0].data == correct_response
+        assert streams[0].data == correct_response
 
     @pytest.mark.unit
     def test_run_binary(self, test_files_path):
@@ -127,7 +127,7 @@ def test_run_binary(self, test_files_path):
         )
         fetcher = LinkContentFetcher()
         streams = fetcher.run(urls=["https://www.example.com"])["streams"]
-        assert streams["application/pdf"][0].data == file_bytes
+        assert streams[0].data == file_bytes
 
     @pytest.mark.unit
     def test_run_bad_status_code(self):
@@ -140,25 +140,25 @@ def test_run_bad_status_code(self):
 
         # empty byte stream is returned because raise_on_failure is False
         assert len(streams) == 1
-        assert streams["text/html"][0].data == empty_byte_stream
+        assert streams[0].data == empty_byte_stream
 
     @pytest.mark.integration
     def test_link_content_fetcher_html(self):
         fetcher = LinkContentFetcher()
         streams = fetcher.run([HTML_URL])["streams"]
-        assert "Haystack" in streams["text/html"][0].data.decode("utf-8")
+        assert "Haystack" in streams[0].data.decode("utf-8")
 
     @pytest.mark.integration
     def test_link_content_fetcher_text(self):
         fetcher = LinkContentFetcher()
         streams = fetcher.run([TEXT_URL])["streams"]
-        assert "Haystack" in streams["text/plain"][0].data.decode("utf-8")
+        assert "Haystack" in streams[0].data.decode("utf-8")
 
     @pytest.mark.integration
     def test_link_content_fetcher_pdf(self):
         fetcher = LinkContentFetcher()
         streams = fetcher.run([PDF_URL])["streams"]
-        assert len(streams["application/pdf"]) == 1 or len(streams["application/octet-stream"]) == 1
+        assert len(streams) == 1
 
     @pytest.mark.integration
     def test_link_content_fetcher_multiple_different_content_types(self):
@@ -168,8 +168,11 @@ def test_link_content_fetcher_multiple_different_content_types(self):
         fetcher = LinkContentFetcher()
         streams = fetcher.run([PDF_URL, HTML_URL])["streams"]
         assert len(streams) == 2
-        assert len(streams["application/pdf"]) == 1 or len(streams["application/octet-stream"]) == 1
-        assert "Haystack" in streams["text/html"][0].data.decode("utf-8")
+        for stream in streams:
+            if stream.metadata["content_type"] == "text/html":
+                assert "Haystack" in stream.data.decode("utf-8")
+            elif stream.metadata["content_type"] == "application/pdf":
+                assert len(stream.data) > 0
 
     @pytest.mark.integration
     def test_link_content_fetcher_multiple_different_content_types_v2(self):
@@ -180,13 +183,9 @@ def test_link_content_fetcher_multiple_different_content_types_v2(self):
 
         fetcher = LinkContentFetcher()
         streams = fetcher.run([PDF_URL, HTML_URL, "https://google.com"])["streams"]
-        assert len(streams) == 2
-        assert len(streams["text/html"]) == 2
-        assert len(streams["application/pdf"]) == 1 or len(streams["application/octet-stream"]) == 1
-        assert "Haystack" in streams["text/html"][0].data.decode("utf-8") or "Haystack" in streams["text/html"][
-            1
-        ].data.decode("utf-8")
-
-        assert "Search" in streams["text/html"][1].data.decode("utf-8") or "Search" in streams["text/html"][
-            0
-        ].data.decode("utf-8")
+        assert len(streams) == 3
+        for stream in streams:
+            if stream.metadata["content_type"] == "text/html":
+                assert "Haystack" in stream.data.decode("utf-8") or "Google" in stream.data.decode("utf-8")
+            elif stream.metadata["content_type"] == "application/pdf":
+                assert len(stream.data) > 0
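
Reviewer note, not part of the patch: a minimal sketch of how the new flat "streams" output could be consumed downstream. It assumes only what the diff shows, namely that run() returns {"streams": List[ByteStream]} and that each stream carries its MIME type under metadata["content_type"]. The URLs are hypothetical, the ByteStream import path is assumed from the preview package layout, and the grouping loop only mimics the kind of routing the commit message says FileTypeRouter is meant to perform.

from collections import defaultdict
from typing import Dict, List

from haystack.preview.components.fetchers.link_content import LinkContentFetcher
from haystack.preview.dataclasses import ByteStream  # assumed preview import path

# Hypothetical URLs, for illustration only.
urls = ["https://example.com/page.html", "https://example.com/report.pdf"]

fetcher = LinkContentFetcher()
streams: List[ByteStream] = fetcher.run(urls=urls)["streams"]

# Group the flat list by the "content_type" metadata key; with this patch the
# fetcher no longer does this grouping itself, so it happens downstream
# (e.g. in a router component such as FileTypeRouter).
streams_by_type: Dict[str, List[ByteStream]] = defaultdict(list)
for stream in streams:
    streams_by_type[stream.metadata["content_type"]].append(stream)

for content_type, grouped in streams_by_type.items():
    print(f"{content_type}: {len(grouped)} stream(s)")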