Skip to content

Commit

Permalink
chore: remove PyMuPDF dependency (#7658)
Browse files Browse the repository at this point in the history
* remove PyMuPDF dependency

* fix import path

* linter

* adjust link content fetcher tests

* remove pymupdf tests

* fix relnote check

* add relnotes
  • Loading branch information
masci authored May 7, 2024
1 parent 2cadee5 commit 62167b1
Show file tree
Hide file tree
Showing 10 changed files with 14 additions and 470 deletions.
7 changes: 3 additions & 4 deletions .github/workflows/license_compliance.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,9 @@ jobs:
# Exclusions in the vanilla distribution must be explicitly motivated
#
# - tqdm is MLP but there are no better alternatives
# - PyMuPDF is optional
# - pinecone-client is optional
# - psycopg2 is optional
exclude: "(?i)^(PyMuPDF|tqdm|pinecone-client|psycopg2).*"
exclude: "(?i)^(tqdm|pinecone-client|psycopg2).*"

# We keep the license inventory on FOSSA
- name: Send license report to Fossa
Expand Down Expand Up @@ -199,7 +198,7 @@ jobs:
# Special cases:
# - pyzmq is flagged because dual-licensed, but we assume using BSD
# - tqdm is MLP but there are no better alternatives
exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|PyMuPDF|pyzmq|tqdm).*"
exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|pyzmq|tqdm).*"

- name: Print report
if: ${{ always() }}
Expand Down Expand Up @@ -272,7 +271,7 @@ jobs:
# Special cases:
# - pyzmq is flagged because dual-licensed, but we assume using BSD
# - tqdm is MLP but there are no better alternatives
exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|PyMuPDF|pyzmq|tqdm).*"
exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|pyzmq|tqdm).*"

- name: Print report
if: ${{ always() }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release_notes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
if: steps.changed-files.outputs.any_changed == 'false' && !contains( github.event.pull_request.labels.*.name, 'ignore-for-release-notes')
run: |
# Check if any of the commit messages contain tags ci/docs/test
if git log --pretty=%s origin/main..HEAD | grep -E '^(ci:|docs:|test:)' > /dev/null; then
if git log --pretty=%s origin/v1.x..HEAD | grep -E '^(ci:|docs:|test:)' > /dev/null; then
echo "Skipping release note check for commits with 'ci:', 'docs:', or 'test:' tags."
else
echo "::error::The release notes file is missing, please add one or attach the label 'ignore-for-release-notes' to this PR."
Expand Down
12 changes: 1 addition & 11 deletions haystack/nodes/file_converter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,6 @@
from haystack.nodes.file_converter.txt import TextConverter
from haystack.nodes.file_converter.azure import AzureConverter
from haystack.nodes.file_converter.parsr import ParsrConverter


try:
with LazyImport() as fitz_import:
# Try to use PyMuPDF, if not available fall back to xpdf
from haystack.nodes.file_converter.pdf import PDFToTextConverter # type: ignore

fitz_import.check()
except (ModuleNotFoundError, ImportError):
from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter # type: ignore # pylint: disable=reimported,ungrouped-imports

from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter
from haystack.nodes.file_converter.markdown import MarkdownConverter
from haystack.nodes.file_converter.image import ImageToTextConverter
307 changes: 0 additions & 307 deletions haystack/nodes/file_converter/pdf.py

This file was deleted.

21 changes: 0 additions & 21 deletions haystack/nodes/retriever/link_content.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import inspect
import io
import logging
from collections import defaultdict
from datetime import datetime
Expand All @@ -14,15 +13,11 @@
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryCallState

from haystack import __version__
from haystack.lazy_imports import LazyImport
from haystack.nodes import PreProcessor, BaseComponent
from haystack.schema import Document, MultiLabel

logger = logging.getLogger(__name__)

with LazyImport("Run 'pip install farm-haystack[pdf]'") as fitz_import:
import fitz


def html_content_handler(response: Response) -> Optional[str]:
"""
Expand All @@ -34,20 +29,6 @@ def html_content_handler(response: Response) -> Optional[str]:
return extractor.get_content(response.text)


def pdf_content_handler(response: Response) -> Optional[str]:
"""
Extracts text from PDF response stream using the PyMuPDF library.
:param response: Response object from the request.
:return: The extracted text.
"""
file_path = io.BytesIO(response.content)
with fitz.open(stream=file_path, filetype="pdf") as doc:
text = "\f".join([page.get_text() for page in doc])

return text.encode("ascii", errors="ignore").decode()


class LinkContentFetcher(BaseComponent):
"""
LinkContentFetcher fetches content from a URL and converts it into a list of Document objects.
Expand Down Expand Up @@ -153,8 +134,6 @@ def __init__(

# register default content handlers
self._register_content_handler("text/html", html_content_handler)
if fitz_import.is_successful():
self._register_content_handler("application/pdf", pdf_content_handler)

# register custom content handlers, can override default handlers
if content_handlers:
Expand Down
4 changes: 1 addition & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,7 @@ file-conversion = [
"python-magic; platform_system != 'Windows'", # Depends on libmagic: https://pypi.org/project/python-magic/
"python-magic-bin; platform_system == 'Windows'", # Needs to be installed without python-magic, otherwise Windows CI gets stuck.
]
pdf = [
"PyMuPDF>=1.18.16" , # PDF text extraction alternative to xpdf; please check AGPLv3 license
]
pdf = []
ocr = [
"pytesseract>0.3.7",
"pdf2image>1.14",
Expand Down
7 changes: 7 additions & 0 deletions releasenotes/notes/remove-pymupdf-15fc66b581538adb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
prelude: >
This release changes the `PDFToTextConverter` so that it doesn't support PyMuPDF anymore.
The converter will always assume `xpdf` is used by default.
upgrade:
- |
To keep using PyMuPDF you must create a custom node, you can use the previous Haystack version for inspiration.
119 changes: 0 additions & 119 deletions test/nodes/test_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,125 +47,6 @@ def test_convert(Converter, samples_path):
assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_command_whitespaces(Converter, samples_path):
converter = Converter()

document = converter.run(file_paths=samples_path / "pdf" / "sample pdf file with spaces on file name.pdf")[0][
"documents"
][0]
assert "ɪ" in document.content


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_encoding(Converter, samples_path):
converter = Converter()

document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_5.pdf")[0]["documents"][0]
assert "Ж" in document.content

document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
assert "ɪ" in document.content


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_sort_by_position(Converter, samples_path):
converter = Converter(sort_by_position=True)

document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_3.pdf")[0]
assert str(document.content).startswith("This is the second test sentence.")


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_ligatures(Converter, samples_path):
converter = Converter()

document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
assert "ff" not in document.content
assert "ɪ" in document.content

document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf", known_ligatures={})[0]["documents"][
0
]
assert "ff" in document.content
assert "ɪ" in document.content

document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf", known_ligatures={"ɪ": "i"})[0][
"documents"
][0]
assert "ff" in document.content
assert "ɪ" not in document.content


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_page_range(Converter, samples_path):
converter = Converter()
document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
pages = document.content.split("\f")

assert (
len(pages) == 4
) # the sample PDF file has four pages, we skipped first (but we wanna correct number of pages)
assert pages[0] == "" # the page 1 was skipped.
assert pages[1] != "" # the page 2 is not empty.
assert pages[2] == "" # the page 3 is empty.


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_page_range_numbers(Converter, samples_path):
converter = Converter()
document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]

preprocessor = PreProcessor(
split_by="word", split_length=5, split_overlap=0, split_respect_sentence_boundary=False, add_page_number=True
)
documents = preprocessor.process([document])

assert documents[1].meta["page"] == 4


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel(Converter, samples_path):
converter = Converter(multiprocessing=True)
document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_6.pdf")[0]

pages = document.content.split("\f")

assert pages[0] == "This is the page 1 of the document."
assert pages[-1] == "This is the page 50 of the document."


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_page_range(Converter, samples_path):
converter = Converter(multiprocessing=True)
document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_6.pdf", start_page=2)[0]

pages = document.content.split("\f")

assert pages[0] == ""
assert len(pages) == 50


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_sort_by_position(Converter, samples_path):
converter = Converter(multiprocessing=True, sort_by_position=True)
document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_6.pdf")[0]

pages = document.content.split("\f")

assert pages[0] == "This is the page 1 of the document."
assert pages[-1] == "This is the page 50 of the document."


@pytest.mark.integration
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_ocr(Converter, samples_path):
Expand Down
3 changes: 0 additions & 3 deletions test/nodes/test_link_content_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def test_init():
assert r.processor is None
assert isinstance(r.handlers, dict)
assert "text/html" in r.handlers
assert "application/pdf" in r.handlers


@pytest.mark.unit
Expand All @@ -49,7 +48,6 @@ def test_init_with_preprocessor():
assert r.processor == pre_processor_mock
assert isinstance(r.handlers, dict)
assert "text/html" in r.handlers
assert "application/pdf" in r.handlers


@pytest.mark.unit
Expand All @@ -65,7 +63,6 @@ def fake_but_valid_video_content_handler(response: Response) -> Optional[str]:

assert isinstance(r.handlers, dict)
assert "text/html" in r.handlers
assert "application/pdf" in r.handlers
assert "video/mp4" in r.handlers


Expand Down
2 changes: 1 addition & 1 deletion test/nodes/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from transformers import AutoTokenizer, PreTrainedTokenizerBase

from haystack import Document
from haystack.nodes.file_converter.pdf import PDFToTextConverter
from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter
from haystack.nodes.preprocessor.preprocessor import PreProcessor


Expand Down

0 comments on commit 62167b1

Please sign in to comment.