Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: remove PyMuPDF dependency #7658

Merged
merged 8 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions .github/workflows/license_compliance.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,9 @@ jobs:
# Exclusions in the vanilla distribution must be explicitly motivated
#
# - tqdm is MLP but there are no better alternatives
# - PyMuPDF is optional
# - pinecone-client is optional
# - psycopg2 is optional
exclude: "(?i)^(PyMuPDF|tqdm|pinecone-client|psycopg2).*"
exclude: "(?i)^(tqdm|pinecone-client|psycopg2).*"

# We keep the license inventory on FOSSA
- name: Send license report to Fossa
Expand Down Expand Up @@ -199,7 +198,7 @@ jobs:
# Special cases:
# - pyzmq is flagged because dual-licensed, but we assume using BSD
# - tqdm is MLP but there are no better alternatives
exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|PyMuPDF|pyzmq|tqdm).*"
exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|pyzmq|tqdm).*"

- name: Print report
if: ${{ always() }}
Expand Down Expand Up @@ -272,7 +271,7 @@ jobs:
# Special cases:
# - pyzmq is flagged because dual-licensed, but we assume using BSD
# - tqdm is MLP but there are no better alternatives
exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|PyMuPDF|pyzmq|tqdm).*"
exclude: "(?i)^(astroid|certifi|chardet|num2words|nvidia-|pathspec|pinecone-client|psycopg2|pylint|pyzmq|tqdm).*"

- name: Print report
if: ${{ always() }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release_notes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
if: steps.changed-files.outputs.any_changed == 'false' && !contains( github.event.pull_request.labels.*.name, 'ignore-for-release-notes')
run: |
# Check if any of the commit messages contain tags ci/docs/test
if git log --pretty=%s origin/main..HEAD | grep -E '^(ci:|docs:|test:)' > /dev/null; then
if git log --pretty=%s origin/v1.x..HEAD | grep -E '^(ci:|docs:|test:)' > /dev/null; then
echo "Skipping release note check for commits with 'ci:', 'docs:', or 'test:' tags."
else
echo "::error::The release notes file is missing, please add one or attach the label 'ignore-for-release-notes' to this PR."
Expand Down
12 changes: 1 addition & 11 deletions haystack/nodes/file_converter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,6 @@
from haystack.nodes.file_converter.txt import TextConverter
from haystack.nodes.file_converter.azure import AzureConverter
from haystack.nodes.file_converter.parsr import ParsrConverter


try:
with LazyImport() as fitz_import:
# Try to use PyMuPDF, if not available fall back to xpdf
from haystack.nodes.file_converter.pdf import PDFToTextConverter # type: ignore

fitz_import.check()
except (ModuleNotFoundError, ImportError):
from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter # type: ignore # pylint: disable=reimported,ungrouped-imports

from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter
from haystack.nodes.file_converter.markdown import MarkdownConverter
from haystack.nodes.file_converter.image import ImageToTextConverter
307 changes: 0 additions & 307 deletions haystack/nodes/file_converter/pdf.py

This file was deleted.

21 changes: 0 additions & 21 deletions haystack/nodes/retriever/link_content.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import inspect
import io
import logging
from collections import defaultdict
from datetime import datetime
Expand All @@ -14,15 +13,11 @@
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryCallState

from haystack import __version__
from haystack.lazy_imports import LazyImport
from haystack.nodes import PreProcessor, BaseComponent
from haystack.schema import Document, MultiLabel

logger = logging.getLogger(__name__)

with LazyImport("Run 'pip install farm-haystack[pdf]'") as fitz_import:
import fitz


def html_content_handler(response: Response) -> Optional[str]:
"""
Expand All @@ -34,20 +29,6 @@ def html_content_handler(response: Response) -> Optional[str]:
return extractor.get_content(response.text)


def pdf_content_handler(response: Response) -> Optional[str]:
"""
Extracts text from PDF response stream using the PyMuPDF library.

:param response: Response object from the request.
:return: The extracted text.
"""
file_path = io.BytesIO(response.content)
with fitz.open(stream=file_path, filetype="pdf") as doc:
text = "\f".join([page.get_text() for page in doc])

return text.encode("ascii", errors="ignore").decode()


class LinkContentFetcher(BaseComponent):
"""
LinkContentFetcher fetches content from a URL and converts it into a list of Document objects.
Expand Down Expand Up @@ -153,8 +134,6 @@ def __init__(

# register default content handlers
self._register_content_handler("text/html", html_content_handler)
if fitz_import.is_successful():
self._register_content_handler("application/pdf", pdf_content_handler)

# register custom content handlers, can override default handlers
if content_handlers:
Expand Down
4 changes: 1 addition & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,7 @@ file-conversion = [
"python-magic; platform_system != 'Windows'", # Depends on libmagic: https://pypi.org/project/python-magic/
"python-magic-bin; platform_system == 'Windows'", # Needs to be installed without python-magic, otherwise Windows CI gets stuck.
]
pdf = [
"PyMuPDF>=1.18.16" , # PDF text extraction alternative to xpdf; please check AGPLv3 license
]
pdf = []
ocr = [
"pytesseract>0.3.7",
"pdf2image>1.14",
Expand Down
7 changes: 7 additions & 0 deletions releasenotes/notes/remove-pymupdf-15fc66b581538adb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
prelude: >
This release changes the `PDFToTextConverter` so that it doesn't support PyMuPDF anymore.
The converter will always assume `xpdf` is used by default.
upgrade:
- |
To keep using PyMuPDF you must create a custom node, you can use the previous Haystack version for inspiration.
119 changes: 0 additions & 119 deletions test/nodes/test_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,125 +47,6 @@ def test_convert(Converter, samples_path):
assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_command_whitespaces(Converter, samples_path):
converter = Converter()

document = converter.run(file_paths=samples_path / "pdf" / "sample pdf file with spaces on file name.pdf")[0][
"documents"
][0]
assert "ɪ" in document.content


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_encoding(Converter, samples_path):
converter = Converter()

document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_5.pdf")[0]["documents"][0]
assert "Ж" in document.content

document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
assert "ɪ" in document.content


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_sort_by_position(Converter, samples_path):
converter = Converter(sort_by_position=True)

document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_3.pdf")[0]
assert str(document.content).startswith("This is the second test sentence.")


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_ligatures(Converter, samples_path):
converter = Converter()

document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
assert "ff" not in document.content
assert "ɪ" in document.content

document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf", known_ligatures={})[0]["documents"][
0
]
assert "ff" in document.content
assert "ɪ" in document.content

document = converter.run(file_paths=samples_path / "pdf" / "sample_pdf_2.pdf", known_ligatures={"ɪ": "i"})[0][
"documents"
][0]
assert "ff" in document.content
assert "ɪ" not in document.content


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_page_range(Converter, samples_path):
converter = Converter()
document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
pages = document.content.split("\f")

assert (
len(pages) == 4
) # the sample PDF file has four pages, we skipped first (but we wanna correct number of pages)
assert pages[0] == "" # the page 1 was skipped.
assert pages[1] != "" # the page 2 is not empty.
assert pages[2] == "" # the page 3 is empty.


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_page_range_numbers(Converter, samples_path):
converter = Converter()
document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]

preprocessor = PreProcessor(
split_by="word", split_length=5, split_overlap=0, split_respect_sentence_boundary=False, add_page_number=True
)
documents = preprocessor.process([document])

assert documents[1].meta["page"] == 4


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel(Converter, samples_path):
converter = Converter(multiprocessing=True)
document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_6.pdf")[0]

pages = document.content.split("\f")

assert pages[0] == "This is the page 1 of the document."
assert pages[-1] == "This is the page 50 of the document."


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_page_range(Converter, samples_path):
converter = Converter(multiprocessing=True)
document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_6.pdf", start_page=2)[0]

pages = document.content.split("\f")

assert pages[0] == ""
assert len(pages) == 50


@pytest.mark.unit
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_sort_by_position(Converter, samples_path):
converter = Converter(multiprocessing=True, sort_by_position=True)
document = converter.convert(file_path=samples_path / "pdf" / "sample_pdf_6.pdf")[0]

pages = document.content.split("\f")

assert pages[0] == "This is the page 1 of the document."
assert pages[-1] == "This is the page 50 of the document."


@pytest.mark.integration
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_ocr(Converter, samples_path):
Expand Down
3 changes: 0 additions & 3 deletions test/nodes/test_link_content_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def test_init():
assert r.processor is None
assert isinstance(r.handlers, dict)
assert "text/html" in r.handlers
assert "application/pdf" in r.handlers


@pytest.mark.unit
Expand All @@ -49,7 +48,6 @@ def test_init_with_preprocessor():
assert r.processor == pre_processor_mock
assert isinstance(r.handlers, dict)
assert "text/html" in r.handlers
assert "application/pdf" in r.handlers


@pytest.mark.unit
Expand All @@ -65,7 +63,6 @@ def fake_but_valid_video_content_handler(response: Response) -> Optional[str]:

assert isinstance(r.handlers, dict)
assert "text/html" in r.handlers
assert "application/pdf" in r.handlers
assert "video/mp4" in r.handlers


Expand Down
2 changes: 1 addition & 1 deletion test/nodes/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from transformers import AutoTokenizer, PreTrainedTokenizerBase

from haystack import Document
from haystack.nodes.file_converter.pdf import PDFToTextConverter
from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter
from haystack.nodes.preprocessor.preprocessor import PreProcessor


Expand Down
Loading