diff --git a/.github/actions/base-cache/action.yml b/.github/actions/base-cache/action.yml index 2aaf5497b5..05b1ddc227 100644 --- a/.github/actions/base-cache/action.yml +++ b/.github/actions/base-cache/action.yml @@ -30,7 +30,9 @@ runs: shell: bash run: | python${{ inputs.python-version }} -m pip install --upgrade virtualenv - python${{ inputs.python-version }} -m venv .venv + if [ ! -d ".venv" ]; then + python${{ inputs.python-version }} -m venv .venv + fi source .venv/bin/activate [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA" if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then @@ -38,6 +40,7 @@ runs: python -m pip install --upgrade setuptools fi make install-ci + make install-nltk-models - name: Save Cache if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true' id: virtualenv-cache-save diff --git a/.github/actions/base-ingest-cache/action.yml b/.github/actions/base-ingest-cache/action.yml index f29d867646..dc9d5105a2 100644 --- a/.github/actions/base-ingest-cache/action.yml +++ b/.github/actions/base-ingest-cache/action.yml @@ -18,7 +18,7 @@ runs: path: | .venv nltk_data - key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }} + key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }} lookup-only: ${{ inputs.check-only }} - name: Set up Python ${{ inputs.python-version }} if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true' @@ -39,6 +39,8 @@ runs: python -m pip install --upgrade setuptools fi make install-ci + make install-nltk-models + make install-all-docs make install-ingest - name: Save Ingest Cache if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true' @@ -48,5 +50,5 @@ runs: path: | .venv nltk_data - key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ 
hashFiles('requirements/*.txt') }} + key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 81afe54c52..88fe84680b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,14 +12,15 @@ permissions: id-token: write contents: read +env: + NLTK_DATA: ${{ github.workspace }}/nltk_data + jobs: setup: strategy: matrix: python-version: ["3.9","3.10","3.11", "3.12"] runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data steps: - uses: actions/checkout@v4 - uses: ./.github/actions/base-cache @@ -78,8 +79,6 @@ jobs: strategy: matrix: python-version: ["3.9","3.10","3.11"] - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data runs-on: ubuntu-latest needs: [setup, changelog] steps: @@ -185,8 +184,6 @@ jobs: python-version: ["3.10"] extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"] runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup, lint, test_unit_no_extras] steps: - uses: actions/checkout@v4 @@ -220,6 +217,7 @@ jobs: sudo apt-get update sudo apt-get install -y tesseract-ocr tesseract-ocr-kor tesseract --version + make install-${{ matrix.extra }} make test-extra-${{ matrix.extra }} CI=true setup_ingest: @@ -227,8 +225,6 @@ jobs: matrix: python-version: [ "3.9","3.10" ] runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup] steps: - uses: actions/checkout@v4 @@ -307,7 +303,6 @@ jobs: MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}} OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" CI: "true" - NLTK_DATA: ${{ github.workspace }}/nltk_data PYTHON: python${{ matrix.python-version }} run: | source .venv/bin/activate @@ -320,6 +315,8 @@ jobs: sudo apt-get install -y tesseract-ocr-kor sudo apt-get install diffstat tesseract --version + make install-all-docs 
+ make install-ingest ./test_unstructured_ingest/test-ingest-src.sh @@ -329,8 +326,6 @@ jobs: # NOTE(yuming): Unstructured API only use Python 3.10 python-version: ["3.10"] runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup, lint] steps: - uses: actions/checkout@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index 1be43eafdf..8b85d3a75e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.16.1-dev5 +## 0.16.1-dev6 ### Enhancements +* **Bump `unstructured-inference` to 0.8.0** and upgrade other dependencies * **Round coordinates** Round coordinates when computing bounding box overlaps in `pdfminer_processing.py` to nearest machine precision. This can help reduce underterministic behavior from machine precision that affects which bounding boxes to combine. ### Features diff --git a/requirements/base.txt b/requirements/base.txt index 5ff129c06a..b4da37cc68 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,7 +4,7 @@ # # pip-compile ./base.in # -anyio==4.6.0 +anyio==4.6.2.post1 # via httpx backoff==2.2.1 # via -r ./base.in @@ -20,7 +20,7 @@ cffi==1.17.1 # via cryptography chardet==5.2.0 # via -r ./base.in -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via # requests # unstructured-client @@ -28,7 +28,7 @@ click==8.1.7 # via # nltk # python-oxmsg -cryptography==43.0.1 +cryptography==43.0.3 # via unstructured-client dataclasses-json==0.6.7 # via @@ -62,7 +62,7 @@ langdetect==1.0.9 # via -r ./base.in lxml==5.3.0 # via -r ./base.in -marshmallow==3.22.0 +marshmallow==3.23.0 # via # dataclasses-json # unstructured-client @@ -84,7 +84,7 @@ packaging==24.1 # via # marshmallow # unstructured-client -psutil==6.0.0 +psutil==6.1.0 # via -r ./base.in pycparser==2.22 # via cffi diff --git a/requirements/dev.txt b/requirements/dev.txt index 3ce9e87d64..bd90364012 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -4,7 +4,7 @@ # # pip-compile ./dev.in # -build==1.2.2 +build==1.2.2.post1 # via 
pip-tools cfgv==3.4.0 # via pre-commit @@ -13,7 +13,7 @@ click==8.1.7 # -c ./base.txt # -c ./test.txt # pip-tools -distlib==0.3.8 +distlib==0.3.9 # via virtualenv filelock==3.16.1 # via virtualenv @@ -36,7 +36,7 @@ platformdirs==4.3.6 # via # -c ./test.txt # virtualenv -pre-commit==3.8.0 +pre-commit==4.0.1 # via -r ./dev.in pyproject-hooks==1.2.0 # via @@ -51,7 +51,7 @@ tomli==2.0.2 # -c ./test.txt # build # pip-tools -virtualenv==20.26.6 +virtualenv==20.27.0 # via pre-commit wheel==0.44.0 # via pip-tools diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index 6946095500..a9533059da 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -4,5 +4,5 @@ # # pip-compile ./extra-epub.in # -pypandoc==1.13 +pypandoc==1.14 # via -r ./extra-epub.in diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 9451b480ca..28ebf301a6 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -8,7 +8,7 @@ lxml==5.3.0 # via # -c ./base.txt # python-docx -pypandoc==1.13 +pypandoc==1.14 # via -r ./extra-odt.in python-docx==1.1.2 # via -r ./extra-odt.in diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index db0079f9f6..c758ad209b 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-paddleocr.in # -anyio==4.6.0 +anyio==4.6.2.post1 # via # -c ./base.txt # httpx @@ -16,7 +16,7 @@ certifi==2024.8.30 # httpcore # httpx # requests -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via # -c ./base.txt # requests @@ -52,7 +52,7 @@ idna==3.10 # anyio # httpx # requests -imageio==2.35.1 +imageio==2.36.0 # via # imgaug # scikit-image @@ -104,7 +104,7 @@ paddlepaddle==3.0.0b1 # via -r ./extra-paddleocr.in pdf2image==1.17.0 # via unstructured-paddleocr -pillow==10.4.0 +pillow==11.0.0 # via # imageio # imgaug @@ -117,9 +117,9 @@ protobuf==4.25.5 # via # -c ././deps/constraints.txt # paddlepaddle 
-pyclipper==1.3.0.post5 +pyclipper==1.3.0.post6 # via unstructured-paddleocr -pyparsing==3.1.4 +pyparsing==3.2.0 # via matplotlib python-dateutil==2.9.0.post0 # via diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index bde50c2ba5..4125059733 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -4,5 +4,5 @@ # # pip-compile ./extra-pandoc.in # -pypandoc==1.13 +pypandoc==1.14 # via -r ./extra-pandoc.in diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index f8a746d687..494f6dc4ff 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -11,5 +11,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.7.36 +unstructured-inference==0.8.0 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index a7d3ce8cf1..0c88ff55d3 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -16,7 +16,7 @@ cffi==1.17.1 # via # -c ./base.txt # cryptography -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via # -c ./base.txt # pdfminer-six @@ -25,7 +25,7 @@ coloredlogs==15.0.1 # via onnxruntime contourpy==1.3.0 # via matplotlib -cryptography==43.0.1 +cryptography==43.0.3 # via # -c ./base.txt # pdfminer-six @@ -48,7 +48,7 @@ fsspec==2024.9.0 # via # huggingface-hub # torch -google-api-core[grpc]==2.20.0 +google-api-core[grpc]==2.21.0 # via google-cloud-vision google-auth==2.35.0 # via @@ -60,14 +60,14 @@ googleapis-common-protos==1.65.0 # via # google-api-core # grpcio-status -grpcio==1.66.2 +grpcio==1.67.0 # via # -c ././deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.62.3 # via google-api-core -huggingface-hub==0.25.1 +huggingface-hub==0.26.0 # via # timm # tokenizers @@ -93,7 +93,7 @@ lxml==5.3.0 # via # -c ./base.txt # pikepdf 
-markupsafe==2.1.5 +markupsafe==3.0.2 # via jinja2 matplotlib==3.9.2 # via @@ -117,6 +117,7 @@ numpy==1.26.4 # scipy # torchvision # transformers + # unstructured-inference omegaconf==2.3.0 # via effdet onnx==1.17.0 @@ -150,11 +151,11 @@ pdfminer-six==20231228 # pdfplumber pdfplumber==0.11.4 # via layoutparser -pi-heif==0.18.0 +pi-heif==0.20.0 # via -r ./extra-pdf-image.in pikepdf==9.3.0 # via -r ./extra-pdf-image.in -pillow==10.4.0 +pillow==11.0.0 # via # layoutparser # matplotlib @@ -192,7 +193,7 @@ pycparser==2.22 # via # -c ./base.txt # cffi -pyparsing==3.1.4 +pyparsing==3.2.0 # via matplotlib pypdf==5.0.1 # via @@ -242,11 +243,11 @@ six==1.16.0 # via # -c ./base.txt # python-dateutil -sympy==1.13.3 +sympy==1.13.1 # via # onnxruntime # torch -timm==1.0.9 +timm==1.0.11 # via # effdet # unstructured-inference @@ -254,13 +255,13 @@ tokenizers==0.19.1 # via # -c ././deps/constraints.txt # transformers -torch==2.4.1 +torch==2.5.0 # via # effdet # timm # torchvision # unstructured-inference -torchvision==0.19.1 +torchvision==0.20.0 # via # effdet # timm @@ -281,7 +282,7 @@ typing-extensions==4.12.2 # torch tzdata==2024.2 # via pandas -unstructured-inference==0.7.36 +unstructured-inference==0.8.0 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 18bbad32ea..87119d3047 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -6,7 +6,7 @@ # lxml==5.3.0 # via python-pptx -pillow==10.4.0 +pillow==11.0.0 # via python-pptx python-pptx==1.0.2 # via -r ./extra-pptx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 7b2e04bde3..5741ccdcd5 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -8,7 +8,7 @@ certifi==2024.8.30 # via # -c ./base.txt # requests -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via # -c ./base.txt # requests @@ -25,7 +25,7 @@ fsspec==2024.9.0 # via # 
huggingface-hub # torch -huggingface-hub==0.25.1 +huggingface-hub==0.26.0 # via # tokenizers # transformers @@ -43,7 +43,7 @@ langdetect==1.0.9 # via # -c ./base.txt # -r ./huggingface.in -markupsafe==2.1.5 +markupsafe==3.0.2 # via jinja2 mpmath==1.3.0 # via sympy @@ -82,13 +82,13 @@ six==1.16.0 # via # -c ./base.txt # langdetect -sympy==1.13.3 +sympy==1.13.1 # via torch tokenizers==0.19.1 # via # -c ././deps/constraints.txt # transformers -torch==2.4.1 +torch==2.5.0 # via -r ./huggingface.in tqdm==4.66.5 # via diff --git a/requirements/test.txt b/requirements/test.txt index 6c9660091a..f368f4f5d3 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,7 +6,7 @@ # annotated-types==0.7.0 # via pydantic -anyio==4.6.0 +anyio==4.6.2.post1 # via # -c ./base.txt # httpx @@ -16,7 +16,7 @@ attrs==24.2.0 # via jsonschema autoflake==2.3.1 # via -r ./test.in -black==24.8.0 +black==24.10.0 # via -r ./test.in certifi==2024.8.30 # via @@ -24,7 +24,7 @@ certifi==2024.8.30 # httpcore # httpx # requests -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via # -c ./base.txt # requests @@ -33,7 +33,7 @@ click==8.1.7 # -c ./base.txt # black # nltk -coverage[toml]==7.6.1 +coverage[toml]==7.6.4 # via # -r ./test.in # pytest-cov @@ -50,7 +50,7 @@ flake8-print==5.0.0 # via -r ./test.in freezegun==1.5.1 # via -r ./test.in -grpcio==1.66.2 +grpcio==1.67.0 # via # -c ././deps/constraints.txt # -r ./test.in @@ -95,7 +95,7 @@ mccabe==0.7.0 # via flake8 multidict==6.1.0 # via yarl -mypy==1.11.2 +mypy==1.12.1 # via -r ./test.in mypy-extensions==1.0.0 # via @@ -119,12 +119,14 @@ pandas==2.2.3 # via label-studio-sdk pathspec==0.12.1 # via black -pillow==10.4.0 +pillow==11.0.0 # via label-studio-sdk platformdirs==4.3.6 # via black pluggy==1.5.0 # via pytest +propcache==0.2.0 + # via yarl pycodestyle==2.12.1 # via # flake8 @@ -226,7 +228,7 @@ urllib3==1.26.20 # -c ./base.txt # requests # vcrpy -vcrpy==6.0.1 +vcrpy==6.0.2 # via -r ./test.in wrapt==1.16.0 # via @@ -234,7 +236,7 @@ 
wrapt==1.16.0 # vcrpy xmljson==0.2.1 # via label-studio-sdk -yarl==1.13.1 +yarl==1.15.5 # via vcrpy # The following packages are considered to be unsafe in a requirements file: diff --git a/test_unstructured/partition/pdf_image/test_inference_utils.py b/test_unstructured/partition/pdf_image/test_inference_utils.py index 085377f189..1000b4bad1 100644 --- a/test_unstructured/partition/pdf_image/test_inference_utils.py +++ b/test_unstructured/partition/pdf_image/test_inference_utils.py @@ -1,4 +1,4 @@ -from unstructured_inference.inference.elements import TextRegion +from unstructured_inference.inference.elements import TextRegion, TextRegions from unstructured_inference.inference.layoutelement import LayoutElement from unstructured.documents.elements import ElementType @@ -17,7 +17,7 @@ def test_merge_text_regions(mock_embedded_text_regions): text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image", ) - merged_text_region = merge_text_regions(mock_embedded_text_regions) + merged_text_region = merge_text_regions(TextRegions.from_list(mock_embedded_text_regions)) assert merged_text_region == expected diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 4362f06bbd..ac780caf7f 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -179,6 +179,12 @@ def _test(result): # check that the pdf has multiple different page numbers assert {element.metadata.page_number for element in result} == expected_page_numbers if UNSTRUCTURED_INCLUDE_DEBUG_METADATA: + print( + [ + (element.metadata.detection_origin, element.category, element.text) + for element in result + ] + ) assert {element.metadata.detection_origin for element in result} == origin if file_mode == "filename": diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index f95dd78595..93f36e2e15 100644 --- 
a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -19,8 +19,8 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve() -# NOTE(crag): point to freemium API for now -API_URL = "https://api.unstructured.io/general/v0/general" +# NOTE(yao): point to paid API for now +API_URL = "https://api.unstructuredapp.io/general/v0/general" is_in_ci = os.getenv("CI", "").lower() not in {"", "false", "f", "0"} skip_not_on_main = os.getenv("GITHUB_REF_NAME", "").lower() != "main" diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 164e9cfa2f..484b099f94 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -338,7 +338,20 @@ "type": "ListItem" }, { - "element_id": "6277cd91869e10d6256f362b08d3e789", + "element_id": "f0f0586caeb3af4284c1b367a5269d27", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "page_number": 2 + }, + "text": "452", + "type": "Header" + }, + { + "element_id": "ac79570be092923eb29899f64281c3b3", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -351,7 +364,7 @@ "type": "Table" }, { - "element_id": "22b8448fe36b3ccd06d1d8e4ea2dc1ea", + "element_id": "13fd694e1ff862d163b840a246964e58", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -364,7 +377,7 @@ "type": "Title" }, { - "element_id": "f2b57562924402b85f6eb07925ea1654", + "element_id": "5f1c4074c1b5d641b724b99be6f5ddfd", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -377,7 +390,7 @@ "type": "NarrativeText" }, { - "element_id": "d9f6efffd49ef59e671206bfb5f094de", + "element_id": "afed004de4c50d761640b6c18729a988", "metadata": { "data_source": {}, "filetype": 
"application/pdf", @@ -390,7 +403,7 @@ "type": "ListItem" }, { - "element_id": "2a1e46bc589c5eca777b657e141e824b", + "element_id": "f93d89ccb971e2b60f44afbf710673c6", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -403,7 +416,7 @@ "type": "NarrativeText" }, { - "element_id": "2c42182c07ecdb96362b534a8fad4d59", + "element_id": "cb6e8acb9c24820b59f8973cc236ef35", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -416,7 +429,7 @@ "type": "ListItem" }, { - "element_id": "c6fd85f9219a2c75bb1f8c1889bb2b5f", + "element_id": "5964ede27be8850de7a13e0dd32c1b21", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -429,7 +442,7 @@ "type": "NarrativeText" }, { - "element_id": "07cdb1623f501ea23a343039300178cc", + "element_id": "e1f7e635d8739a97d8d0000ba8004f61", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -442,7 +455,7 @@ "type": "ListItem" }, { - "element_id": "4bf8165bcb21c5296b741ba0f9e38f93", + "element_id": "deb8964830ba1f9dd1eec7b08bd3ea19", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -455,7 +468,7 @@ "type": "Title" }, { - "element_id": "85918ce2a03e9f236137a0fe72985af0", + "element_id": "be270e13c935334fa3b17b13066d639b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -468,7 +481,7 @@ "type": "NarrativeText" }, { - "element_id": "93537983496efa695cfc65ad895d9412", + "element_id": "5c97405ec921495b23d2b400516cbd06", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -481,7 +494,7 @@ "type": "Image" }, { - "element_id": "76b94e78b638b79374e266284c1a0d83", + "element_id": "7956ee39ac5e080a362967e2f6a5753e", "metadata": { "data_source": {}, "filetype": "application/pdf", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index 64c57d6dfc..5a25c95e60 100644 --- 
a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -598,20 +598,7 @@ "type": "NarrativeText" }, { - "element_id": "448de3300a8c7e2cfdd2028dd0bb4171", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "languages": [ - "eng" - ], - "page_number": 2 - }, - "text": "and", - "type": "NarrativeText" - }, - { - "element_id": "b13807f59ac7c6647ee0aee74f9b0dd3", + "element_id": "db6ff60cbdb77adc14a6b9491af8d161", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -624,7 +611,7 @@ "type": "ListItem" }, { - "element_id": "db480e847a5703b19be6b79223e1ee03", + "element_id": "9f6ef223a141a5381951eff39b3af039", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -637,7 +624,7 @@ "type": "NarrativeText" }, { - "element_id": "326c44638a881f86474b82cc244896f9", + "element_id": "5c67842128e14fc16344beaa2aa0111e", "metadata": { "data_source": {}, "filetype": "application/pdf", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 8c3c0f6ae6..66e1dbea73 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -1276,9 +1276,75 @@ } } }, + { + "type": "ListItem", + "element_id": "53b448c75f1556b1f60b4e3324bd0724", + "text": "1 import layoutparser as lp", + "metadata": { + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "page_number": 5, + "data_source": { + "record_locator": { + "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "ListItem", + "element_id": "a002e13c7ea2613b2eabb9ea3501856d", + "text": "3 model = lp . De t e c tro n2 Lay outM odel (", + "metadata": { + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "page_number": 5, + "data_source": { + "record_locator": { + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "ListItem", + "element_id": "366c05fd7babc86bf01d690b9df755da", + "text": "5 layout = model . detect ( image )", + "metadata": { + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "page_number": 5, + "data_source": { + "record_locator": { + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, { "type": "NarrativeText", - "element_id": "59171bb0b4a32c9ec1b0e1d327ddb88f", + "element_id": "f888c5e8f5b1339f2af75612ea13c719", "text": "LayoutParser provides a wealth of pre-trained model weights using various datasets covering di\ufb00erent languages, time periods, and document types. Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are signi\ufb01cantly di\ufb00erent from the training dataset. As document structures and layouts vary greatly in di\ufb00erent domains, it is important to select models trained on a dataset similar to the test samples. 
A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp:///.", "metadata": { "filetype": "application/pdf", diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 038be7ea70..6ce43d5c79 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.1-dev5" # pragma: no cover +__version__ = "0.16.1-dev6" # pragma: no cover diff --git a/unstructured/partition/pdf_image/inference_utils.py b/unstructured/partition/pdf_image/inference_utils.py index 6fdd4c05cf..7218eb93b9 100644 --- a/unstructured/partition/pdf_image/inference_utils.py +++ b/unstructured/partition/pdf_image/inference_utils.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Optional from unstructured_inference.constants import Source -from unstructured_inference.inference.elements import TextRegion +from unstructured_inference.inference.elements import TextRegion, TextRegions from unstructured_inference.inference.layoutelement import ( LayoutElement, partition_groups_from_regions, @@ -66,9 +66,9 @@ def build_layout_elements_from_ocr_regions( for r in regions: ocr_regions.remove(r) - grouped_regions.append(regions) + grouped_regions.append(TextRegions.from_list(regions)) else: - grouped_regions = partition_groups_from_regions(ocr_regions) + grouped_regions = partition_groups_from_regions(TextRegions.from_list(ocr_regions)) merged_regions = [merge_text_regions(group) for group in grouped_regions] return [ @@ -79,12 +79,12 @@ def build_layout_elements_from_ocr_regions( ] -def merge_text_regions(regions: list[TextRegion]) -> TextRegion: +def merge_text_regions(regions: TextRegions) -> TextRegion: """ Merge a list of TextRegion objects into a single TextRegion. Parameters: - - group (list[TextRegion]): A list of TextRegion objects to be merged. + - regions (TextRegions): A group of TextRegion objects to be merged. Returns: - TextRegion: A single merged TextRegion object. 
@@ -93,13 +93,12 @@ def merge_text_regions(regions: list[TextRegion]) -> TextRegion: if not regions: raise ValueError("The text regions to be merged must be provided.") - min_x1 = min([tr.bbox.x1 for tr in regions]) - min_y1 = min([tr.bbox.y1 for tr in regions]) - max_x2 = max([tr.bbox.x2 for tr in regions]) - max_y2 = max([tr.bbox.y2 for tr in regions]) + min_x1 = regions.x1.min().astype(float) + min_y1 = regions.y1.min().astype(float) + max_x2 = regions.x2.max().astype(float) + max_y2 = regions.y2.max().astype(float) - merged_text = " ".join([tr.text for tr in regions if tr.text]) - sources = [tr.source for tr in regions] - source = sources[0] if all(s == sources[0] for s in sources) else None + merged_text = " ".join([text for text in regions.texts if text]) + source = regions.source return TextRegion.from_coords(min_x1, min_y1, max_x2, max_y2, merged_text, source)