chore: remove PyMuPDF dependency (#7658)

* remove PyMuPDF dependency * fix import path * linter * adjust link content fetcher tests * remove pymupdf tests * fix relnote check * add relnotes
deepset-ai · May 7, 2024 · 62167b1 · 62167b1
1 parent 2cadee5
commit 62167b1
Show file tree

Hide file tree

Showing 10 changed files with 14 additions and 470 deletions.
diff --git a/.github/workflows/license_compliance.yml b/.github/workflows/license_compliance.yml
@@ -42,10 +42,9 @@ jobs:
           # Exclusions in the vanilla distribution must be explicitly motivated
           #
           # - tqdm is MLP but there are no better alternatives
-          # - PyMuPDF is optional
           # - pinecone-client is optional
           # - psycopg2 is optional
-          exclude: "(?i)^(PyMuPDF|tqdm|pinecone-client|psycopg2).*"
+          exclude: "(?i)^(tqdm|pinecone-client|psycopg2).*"
 
       # We keep the license inventory on FOSSA
       - name: Send license report to Fossa
@@ -199,7 +198,7 @@ jobs:
           # Special cases:
           # - pyzmq is flagged because dual-licensed, but we assume using BSD
           # - tqdm is MLP but there are no better alternatives
-          exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|PyMuPDF|pyzmq|tqdm).*"
+          exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|pyzmq|tqdm).*"
 
       - name: Print report
         if: ${{ always() }}
@@ -272,7 +271,7 @@ jobs:
           # Special cases:
           # - pyzmq is flagged because dual-licensed, but we assume using BSD
           # - tqdm is MLP but there are no better alternatives
-          exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|PyMuPDF|pyzmq|tqdm).*"
+          exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|pyzmq|tqdm).*"
 
       - name: Print report
         if: ${{ always() }}

diff --git a/.github/workflows/release_notes.yml b/.github/workflows/release_notes.yml
@@ -36,7 +36,7 @@ jobs:
         if: steps.changed-files.outputs.any_changed == 'false' && !contains( github.event.pull_request.labels.*.name, 'ignore-for-release-notes')
         run: |
           # Check if any of the commit messages contain tags ci/docs/test
-          if git log --pretty=%s origin/main..HEAD | grep -E '^(ci:|docs:|test:)' > /dev/null; then
+          if git log --pretty=%s origin/v1.x..HEAD | grep -E '^(ci:|docs:|test:)' > /dev/null; then
             echo "Skipping release note check for commits with 'ci:', 'docs:', or 'test:' tags."
           else
             echo "::error::The release notes file is missing, please add one or attach the label 'ignore-for-release-notes' to this PR."

diff --git a/haystack/nodes/file_converter/__init__.py b/haystack/nodes/file_converter/__init__.py
@@ -9,16 +9,6 @@
 from haystack.nodes.file_converter.txt import TextConverter
 from haystack.nodes.file_converter.azure import AzureConverter
 from haystack.nodes.file_converter.parsr import ParsrConverter
-
-
-try:
-    with LazyImport() as fitz_import:
-        # Try to use PyMuPDF, if not available fall back to xpdf
-        from haystack.nodes.file_converter.pdf import PDFToTextConverter  # type: ignore
-
-    fitz_import.check()
-except (ModuleNotFoundError, ImportError):
-    from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter  # type: ignore  # pylint: disable=reimported,ungrouped-imports
-
+from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter
 from haystack.nodes.file_converter.markdown import MarkdownConverter
 from haystack.nodes.file_converter.image import ImageToTextConverter
diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py
diff --git a/haystack/nodes/retriever/link_content.py b/haystack/nodes/retriever/link_content.py
@@ -1,5 +1,4 @@
 import inspect
-import io
 import logging
 from collections import defaultdict
 from datetime import datetime
@@ -14,15 +13,11 @@
 from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryCallState
 
 from haystack import __version__
-from haystack.lazy_imports import LazyImport
 from haystack.nodes import PreProcessor, BaseComponent
 from haystack.schema import Document, MultiLabel
 
 logger = logging.getLogger(__name__)
 
-with LazyImport("Run 'pip install farm-haystack[pdf]'") as fitz_import:
-    import fitz
-
 
 def html_content_handler(response: Response) -> Optional[str]:
     """
@@ -34,20 +29,6 @@ def html_content_handler(response: Response) -> Optional[str]:
     return extractor.get_content(response.text)
 
 
-def pdf_content_handler(response: Response) -> Optional[str]:
-    """
-    Extracts text from PDF response stream using the PyMuPDF library.
-
-    :param response: Response object from the request.
-    :return: The extracted text.
-    """
-    file_path = io.BytesIO(response.content)
-    with fitz.open(stream=file_path, filetype="pdf") as doc:
-        text = "\f".join([page.get_text() for page in doc])
-
-    return text.encode("ascii", errors="ignore").decode()
-
-
 class LinkContentFetcher(BaseComponent):
     """
     LinkContentFetcher fetches content from a URL and converts it into a list of Document objects.
@@ -153,8 +134,6 @@ def __init__(
 
         # register default content handlers
         self._register_content_handler("text/html", html_content_handler)
-        if fitz_import.is_successful():
-            self._register_content_handler("application/pdf", pdf_content_handler)
 
         # register custom content handlers, can override default handlers
         if content_handlers:

diff --git a/pyproject.toml b/pyproject.toml
@@ -161,9 +161,7 @@ file-conversion = [
   "python-magic; platform_system != 'Windows'",  # Depends on libmagic: https://pypi.org/project/python-magic/
   "python-magic-bin; platform_system == 'Windows'",  # Needs to be installed without python-magic, otherwise Windows CI gets stuck.
 ]
-pdf = [
-  "PyMuPDF>=1.18.16" ,  # PDF text extraction alternative to xpdf; please check AGPLv3 license
-]
+pdf = []
 ocr = [
   "pytesseract>0.3.7",
   "pdf2image>1.14",

diff --git a/releasenotes/notes/remove-pymupdf-15fc66b581538adb.yaml b/releasenotes/notes/remove-pymupdf-15fc66b581538adb.yaml
@@ -0,0 +1,7 @@
+---
+prelude: >
+    This release changes the `PDFToTextConverter` so that it doesn't support PyMuPDF anymore.
+    The converter will always assume `xpdf` is used by default.
+upgrade:
+  - |
+    To keep using PyMuPDF you must create a custom node, you can use the previous Haystack version for inspiration.
diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py
@@ -47,125 +47,6 @@ def test_convert(Converter, samples_path):
     assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace
 
 
-@pytest.mark.unit
-@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-def test_pdf_command_whitespaces(Converter, samples_path):
-    converter = Converter()
-
-    document = converter.run(file_paths=samples_path / "pdf" / "sample pdf file with spaces on file name.pdf")[0][
-        "documents"
-    ][0]
-    assert "ɪ" in document.content
-
-
-@pytest.mark.unit
-@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-def test_pdf_encoding(Converter, samples_path):
-    converter = Converter()
-
-    document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_5.pdf")[0]["documents"][0]
-    assert "Ж" in document.content
-
-    document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
-    assert "ɪ" in document.content
-
-
-@pytest.mark.unit
-@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-def test_pdf_sort_by_position(Converter, samples_path):
-    converter = Converter(sort_by_position=True)
-
-    document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_3.pdf")[0]
-    assert str(document.content).startswith("This is the second test sentence.")
-
-
-@pytest.mark.unit
-@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-def test_pdf_ligatures(Converter, samples_path):
-    converter = Converter()
-
-    document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
-    assert "ﬀ" not in document.content
-    assert "ɪ" in document.content
-
-    document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf", known_ligatures={})[0]["documents"][
-        0
-    ]
-    assert "ﬀ" in document.content
-    assert "ɪ" in document.content
-
-    document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf", known_ligatures={"ɪ": "i"})[0][
-        "documents"
-    ][0]
-    assert "ﬀ" in document.content
-    assert "ɪ" not in document.content
-
-
-@pytest.mark.unit
-@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-def test_pdf_page_range(Converter, samples_path):
-    converter = Converter()
-    document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
-    pages = document.content.split("\f")
-
-    assert (
-        len(pages) == 4
-    )  # the sample PDF file has four pages, we skipped first (but we wanna correct number of pages)
-    assert pages[0] == ""  # the page 1 was skipped.
-    assert pages[1] != ""  # the page 2 is not empty.
-    assert pages[2] == ""  # the page 3 is empty.
-
-
-@pytest.mark.unit
-@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-def test_pdf_page_range_numbers(Converter, samples_path):
-    converter = Converter()
-    document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
-
-    preprocessor = PreProcessor(
-        split_by="word", split_length=5, split_overlap=0, split_respect_sentence_boundary=False, add_page_number=True
-    )
-    documents = preprocessor.process([document])
-
-    assert documents[1].meta["page"] == 4
-
-
-@pytest.mark.unit
-@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-def test_pdf_parallel(Converter, samples_path):
-    converter = Converter(multiprocessing=True)
-    document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_6.pdf")[0]
-
-    pages = document.content.split("\f")
-
-    assert pages[0] == "This is the page 1 of the document."
-    assert pages[-1] == "This is the page 50 of the document."
-
-
-@pytest.mark.unit
-@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-def test_pdf_parallel_page_range(Converter, samples_path):
-    converter = Converter(multiprocessing=True)
-    document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_6.pdf", start_page=2)[0]
-
-    pages = document.content.split("\f")
-
-    assert pages[0] == ""
-    assert len(pages) == 50
-
-
-@pytest.mark.unit
-@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-def test_pdf_parallel_sort_by_position(Converter, samples_path):
-    converter = Converter(multiprocessing=True, sort_by_position=True)
-    document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_6.pdf")[0]
-
-    pages = document.content.split("\f")
-
-    assert pages[0] == "This is the page 1 of the document."
-    assert pages[-1] == "This is the page 50 of the document."
-
-
 @pytest.mark.integration
 @pytest.mark.parametrize("Converter", [PDFToTextConverter])
 def test_pdf_parallel_ocr(Converter, samples_path):

diff --git a/test/nodes/test_link_content_fetcher.py b/test/nodes/test_link_content_fetcher.py
@@ -36,7 +36,6 @@ def test_init():
     assert r.processor is None
     assert isinstance(r.handlers, dict)
     assert "text/html" in r.handlers
-    assert "application/pdf" in r.handlers
 
 
 @pytest.mark.unit
@@ -49,7 +48,6 @@ def test_init_with_preprocessor():
     assert r.processor == pre_processor_mock
     assert isinstance(r.handlers, dict)
     assert "text/html" in r.handlers
-    assert "application/pdf" in r.handlers
 
 
 @pytest.mark.unit
@@ -65,7 +63,6 @@ def fake_but_valid_video_content_handler(response: Response) -> Optional[str]:
 
     assert isinstance(r.handlers, dict)
     assert "text/html" in r.handlers
-    assert "application/pdf" in r.handlers
     assert "video/mp4" in r.handlers
 
 

diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py
@@ -12,7 +12,7 @@
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
 from haystack import Document
-from haystack.nodes.file_converter.pdf import PDFToTextConverter
+from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter
 from haystack.nodes.preprocessor.preprocessor import PreProcessor