Skip to content

Commit

Permalink
fix: Strategy heuristic test & fix (#203)
Browse files Browse the repository at this point in the history
* fix: add parse tests for every supported extension

* add: each parser has supported FileExtensions

* fix: ValueError for unsupported extensions

* feat: add heuristic test

* fix: strategy

* add: native tests

* add: README mentions SAFEDOCS as a source
  • Loading branch information
chloedia authored Dec 23, 2024
1 parent 9dff0de commit 7b7fb40
Show file tree
Hide file tree
Showing 16 changed files with 24 additions and 6 deletions.
2 changes: 2 additions & 0 deletions libs/megaparse/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# MegaParse CORE

- Core package of megaparse

> **Note:** The test files in `tests/pdf/ocr` and `tests/pdf/native` come from SAFEDOCS (CC-MAIN-2021-31-PDF-UNTRUNCATED). You can find more information [here](https://digitalcorpora.org/corpora/file-corpora/cc-main-2021-31-pdf-untruncated/).
4 changes: 3 additions & 1 deletion libs/megaparse/src/megaparse/examples/parse_file.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from megaparse import MegaParse
from megaparse.parser.unstructured_parser import UnstructuredParser
import pypdfium2 as pdfium


def main():
parser = UnstructuredParser()
megaparse = MegaParse(parser=parser)

file_path = "somewhere/only_pdfs/4 The Language of Medicine 2024.07.21.pdf"
file_path = "./tests/pdf/ocr/0168123.pdf"

parsed_file = megaparse.load(file_path)
print(f"\n----- File Response : {file_path} -----\n")
print(parsed_file)
Expand Down
1 change: 1 addition & 0 deletions libs/megaparse/src/megaparse/megaparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def _select_parser(
file: BinaryIO | None = None,
file_extension: str | FileExtension = "",
) -> BaseParser:
local_strategy = None
if self.strategy != StrategyEnum.AUTO or file_extension != FileExtension.PDF:
return self.parser
if file:
Expand Down
3 changes: 3 additions & 0 deletions libs/megaparse/src/megaparse/parser/strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from megaparse_sdk.schema.parser_config import StrategyEnum
from pypdfium2._helpers.page import PdfPage
from pypdfium2._helpers.pageobjects import PdfImage
from pypdfium2._helpers.textpage import PdfTextPage

logger = logging.getLogger("megaparse")

Expand All @@ -19,6 +20,8 @@ def get_strategy_page(page: PdfPage, threshold_image_page: float) -> StrategyEnu
for obj in page.get_objects():
if isinstance(obj, PdfImage):
images_coords.append(obj.get_pos())
elif obj.type == 2:
images_coords.append(obj.get_pos())

canva = np.zeros((int(page.get_height()), int(page.get_width())))
for coords in images_coords:
Expand Down
Binary file added libs/megaparse/tests/pdf/native/0168004.pdf
Binary file not shown.
Binary file added libs/megaparse/tests/pdf/native/0168011.pdf
Binary file not shown.
Binary file added libs/megaparse/tests/pdf/native/0168014.pdf
Binary file not shown.
Binary file added libs/megaparse/tests/pdf/ocr/0168119.pdf
Binary file not shown.
Binary file added libs/megaparse/tests/pdf/ocr/0168120.pdf
Binary file not shown.
Binary file added libs/megaparse/tests/pdf/ocr/0168123.pdf
Binary file not shown.
Binary file added libs/megaparse/tests/pdf/ocr/0168126.pdf
Binary file not shown.
Binary file added libs/megaparse/tests/pdf/ocr/0168127.pdf
Binary file not shown.
Binary file added libs/megaparse/tests/pdf/ocr/0168322.pdf
Binary file not shown.
Binary file not shown.
20 changes: 15 additions & 5 deletions libs/megaparse/tests/pdf/test_detect_ocr.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,20 @@
import os

import pytest
from megaparse.parser.strategy import determine_strategy
from megaparse_sdk.schema.parser_config import StrategyEnum

ocr_pdfs = os.listdir("./tests/pdf/ocr")
native_pdfs = os.listdir("./tests/pdf/native")


def test_strategy_all():
pdf = "./tests/pdf/sample_pdf.pdf"
strategy = determine_strategy(
pdf, threshold_pages_ocr=0.2, threshold_image_page=0.3
)
@pytest.mark.parametrize("hi_res_pdf", ocr_pdfs)
def test_hi_res_strategy(hi_res_pdf):
    """Check that each PDF listed in ``ocr_pdfs`` is classified as HI_RES.

    The file name comes from the parametrized listing of ``./tests/pdf/ocr``
    and is resolved relative to the current working directory.
    """
    pdf_path = f"./tests/pdf/ocr/{hi_res_pdf}"
    detected = determine_strategy(pdf_path)
    assert detected == StrategyEnum.HI_RES


@pytest.mark.parametrize("native_pdf", native_pdfs)
def test_fast_strategy(native_pdf):
    """Check that each PDF listed in ``native_pdfs`` is classified as FAST.

    The file name comes from the parametrized listing of
    ``./tests/pdf/native`` and is resolved relative to the current working
    directory.
    """
    pdf_path = f"./tests/pdf/native/{native_pdf}"
    detected = determine_strategy(pdf_path)
    assert detected == StrategyEnum.FAST
Binary file added libs/megaparse/tests/pdf/undefined/0168003.pdf
Binary file not shown.

0 comments on commit 7b7fb40

Please sign in to comment.