From 6ba8135bf95ecdbd1733a84c12f7cdbddf4f37ad Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 30 Aug 2024 15:12:46 -0400 Subject: [PATCH] fix: check ole storage content to differentiate filetypes (#3581) ### Summary Updates the file detection logic for OLE files to check the storage content of the file to more reliably differentiate between DOC, PPT, XLS and MSG files. This corrects a bug that caused file type detection to be incorrect in cases where the `filetype` library guessed an incorrect MIME type, such as `'application/vnd.ms-excel'` for a `.msg` file. As part of this work, the `"msg"` extra was removed because the `python-oxmsg` package is now a base dependency. ### Testing Using a test `.msg` file that returns `'application/vnd.ms-excel'` from `filetype.guess_mime`. ```python from unstructured.file_utils.filetype import detect_filetype filename = "test-file.msg" detect_filetype(filename=filename) # result should be FileType.MSG ``` --- .github/workflows/ci.yml | 2 +- CHANGELOG.md | 3 +- Makefile | 12 +--- requirements/base.in | 1 + requirements/base.txt | 13 ++++- requirements/dev.txt | 6 +- requirements/extra-msg.in | 4 -- requirements/extra-msg.txt | 18 ------ requirements/extra-paddleocr.txt | 4 +- requirements/extra-pdf-image.txt | 4 +- requirements/huggingface.txt | 4 +- requirements/ingest/airtable.txt | 4 +- requirements/ingest/astradb.txt | 4 +- .../ingest/azure-cognitive-search.txt | 4 +- requirements/ingest/azure.txt | 4 +- requirements/ingest/box.txt | 4 +- requirements/ingest/chroma.txt | 4 +- requirements/ingest/clarifai.txt | 4 +- requirements/ingest/confluence.txt | 6 +- requirements/ingest/databricks-volumes.txt | 4 +- requirements/ingest/dropbox.txt | 4 +- requirements/ingest/elasticsearch.txt | 4 +- requirements/ingest/embed-aws-bedrock.txt | 6 +- requirements/ingest/embed-huggingface.txt | 6 +- requirements/ingest/embed-mixedbreadai.txt | 2 +- requirements/ingest/embed-octoai.txt | 6 +- requirements/ingest/embed-openai.txt | 8 
+-- requirements/ingest/embed-vertexai.txt | 6 +- requirements/ingest/embed-voyageai.txt | 6 +- requirements/ingest/gcs.txt | 4 +- requirements/ingest/github.txt | 4 +- requirements/ingest/gitlab.txt | 4 +- requirements/ingest/google-drive.txt | 4 +- requirements/ingest/hubspot.txt | 4 +- requirements/ingest/jira.txt | 6 +- requirements/ingest/notion.txt | 2 +- requirements/ingest/onedrive.txt | 4 +- requirements/ingest/opensearch.txt | 4 +- requirements/ingest/outlook.txt | 4 +- requirements/ingest/pinecone.txt | 4 +- requirements/ingest/qdrant.txt | 4 +- requirements/ingest/reddit.txt | 4 +- requirements/ingest/s3.txt | 2 +- requirements/ingest/salesforce.txt | 4 +- requirements/ingest/sharepoint.txt | 4 +- requirements/ingest/singlestore.txt | 4 +- requirements/ingest/weaviate.txt | 4 +- requirements/ingest/wikipedia.txt | 4 +- requirements/test.txt | 4 +- setup.py | 3 - test_unstructured/file_utils/test_filetype.py | 56 +++++++++++++------ unstructured/__version__.py | 2 +- unstructured/file_utils/filetype.py | 24 ++++++++ 53 files changed, 171 insertions(+), 149 deletions(-) delete mode 100644 requirements/extra-msg.in delete mode 100644 requirements/extra-msg.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 491b817f94..30aa33b2d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -215,7 +215,7 @@ jobs: strategy: matrix: python-version: ["3.10"] - extra: ["csv", "docx", "odt", "markdown", "pypandoc", "msg", "pdf-image", "pptx", "xlsx"] + extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"] runs-on: ubuntu-latest env: NLTK_DATA: ${{ github.workspace }}/nltk_data diff --git a/CHANGELOG.md b/CHANGELOG.md index d06493d428..7dfddbd181 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.9-dev1 +## 0.15.9 ### Enhancements @@ -8,6 +8,7 @@ ### Fixes +* **Check storage contents for OLE file type detection** Updates `detect_filetype` to check the content of OLE files to more 
reliably differentiate DOC, PPT, XLS, and MSG files. As part of this, the `"msg"` extra was removed because the `python-oxmsg` package is now a base dependency. * **Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile** Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` of NamedTemporaryFiles have been replaced with TemporaryFileDirectory to avoid a known issue: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile ## 0.15.8 diff --git a/Makefile b/Makefile index f856ee5526..853e7ebd45 100644 --- a/Makefile +++ b/Makefile @@ -83,10 +83,6 @@ install-pypandoc: install-markdown: python3 -m pip install -r requirements/extra-markdown.txt -.PHONY: install-msg -install-msg: - python3 -m pip install -r requirements/extra-msg.txt - .PHONY: install-pdf-image install-pdf-image: python3 -m pip install -r requirements/extra-pdf-image.txt @@ -100,7 +96,7 @@ install-xlsx: python3 -m pip install -r requirements/extra-xlsx.txt .PHONY: install-all-docs -install-all-docs: install-base install-csv install-docx install-epub install-odt install-pypandoc install-markdown install-msg install-pdf-image install-pptx install-xlsx +install-all-docs: install-base install-csv install-docx install-epub install-odt install-pypandoc install-markdown install-pdf-image install-pptx install-xlsx .PHONY: install-all-ingest install-all-ingest: @@ -343,12 +339,6 @@ test-extra-epub: test-extra-markdown: PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_md.py -.PHONY: test-extra-msg -test-extra-msg: - # NOTE(scanny): exclude attachment test because partitioning attachments requires other extras - PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_msg.py \ - -k "not test_partition_msg_can_process_attachments" - .PHONY: test-extra-odt test-extra-odt: PYTHONPATH=. 
CI=$(CI) pytest test_unstructured/partition/test_odt.py diff --git a/requirements/base.in b/requirements/base.in index 71e6f4d521..6f3be98c91 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -21,3 +21,4 @@ unstructured-client wrapt tqdm psutil +python-oxmsg diff --git a/requirements/base.txt b/requirements/base.txt index ba6bf71665..c3866d82f6 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -10,7 +10,7 @@ backoff==2.2.1 # via -r ./base.in beautifulsoup4==4.12.3 # via -r ./base.in -certifi==2024.7.4 +certifi==2024.8.30 # via # httpcore # httpx @@ -23,7 +23,9 @@ charset-normalizer==3.3.2 # requests # unstructured-client click==8.1.7 - # via nltk + # via + # nltk + # python-oxmsg dataclasses-json==0.6.7 # via # -r ./base.in @@ -70,6 +72,8 @@ nltk==3.9.1 # via -r ./base.in numpy==1.26.4 # via -r ./base.in +olefile==0.47 + # via python-oxmsg orderly-set==5.2.2 # via deepdiff packaging==24.1 @@ -86,6 +90,8 @@ python-iso639==2024.4.27 # via -r ./base.in python-magic==0.4.27 # via -r ./base.in +python-oxmsg==0.0.1 + # via -r ./base.in rapidfuzz==3.9.6 # via -r ./base.in regex==2024.7.24 @@ -120,6 +126,7 @@ typing-extensions==4.12.2 # anyio # emoji # pypdf + # python-oxmsg # typing-inspect # unstructured-client typing-inspect==0.9.0 @@ -128,7 +135,7 @@ typing-inspect==0.9.0 # unstructured-client unstructured-client==0.25.5 # via -r ./base.in -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ././deps/constraints.txt # requests diff --git a/requirements/dev.txt b/requirements/dev.txt index 2088f7988b..d0f2c130e8 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -34,7 +34,7 @@ bleach==6.1.0 # via nbconvert build==1.2.1 # via pip-tools -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./base.txt # -c ./test.txt @@ -130,7 +130,7 @@ jsonschema[format-nongpl]==3.2.0 # jupyter-events # jupyterlab-server # nbformat -jupyter==1.1.0 +jupyter==1.1.1 # via -r ./dev.in jupyter-client==7.4.9 # via @@ -370,7 +370,7 @@ typing-extensions==4.12.2 # 
-c ./test.txt # anyio # ipython -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ././deps/constraints.txt # -c ./base.txt diff --git a/requirements/extra-msg.in b/requirements/extra-msg.in deleted file mode 100644 index b7293bda0d..0000000000 --- a/requirements/extra-msg.in +++ /dev/null @@ -1,4 +0,0 @@ --c ./deps/constraints.txt --c base.txt - -python-oxmsg diff --git a/requirements/extra-msg.txt b/requirements/extra-msg.txt deleted file mode 100644 index cb79b4f21c..0000000000 --- a/requirements/extra-msg.txt +++ /dev/null @@ -1,18 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./extra-msg.in -# -click==8.1.7 - # via - # -c ./base.txt - # python-oxmsg -olefile==0.47 - # via python-oxmsg -python-oxmsg==0.0.1 - # via -r ./extra-msg.in -typing-extensions==4.12.2 - # via - # -c ./base.txt - # python-oxmsg diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 89e98332ea..7f226acbf0 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -10,7 +10,7 @@ anyio==4.4.0 # httpx astor==0.8.1 # via paddlepaddle -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./base.txt # httpcore @@ -170,7 +170,7 @@ typing-extensions==4.12.2 # paddlepaddle unstructured-paddleocr==2.8.1.0 # via -r ./extra-paddleocr.in -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ././deps/constraints.txt # -c ./base.txt diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index e3ae896295..492c7d7d4c 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -8,7 +8,7 @@ antlr4-python3-runtime==4.9.3 # via omegaconf cachetools==5.5.0 # via google-auth -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./base.txt # requests @@ -279,7 +279,7 @@ unstructured-inference==0.7.36 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in -urllib3==1.26.19 +urllib3==1.26.20 # via # -c 
././deps/constraints.txt # -c ./base.txt diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 05af7ee05a..1eda48ff69 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -4,7 +4,7 @@ # # pip-compile ./huggingface.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./base.txt # requests @@ -103,7 +103,7 @@ typing-extensions==4.12.2 # -c ./base.txt # huggingface-hub # torch -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ././deps/constraints.txt # -c ./base.txt diff --git a/requirements/ingest/airtable.txt b/requirements/ingest/airtable.txt index 121e3a7e7f..7dd887296c 100644 --- a/requirements/ingest/airtable.txt +++ b/requirements/ingest/airtable.txt @@ -6,7 +6,7 @@ # annotated-types==0.7.0 # via pydantic -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -36,7 +36,7 @@ typing-extensions==4.12.2 # pyairtable # pydantic # pydantic-core -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/astradb.txt b/requirements/ingest/astradb.txt index 7edbc3bb90..b86a2bad5e 100644 --- a/requirements/ingest/astradb.txt +++ b/requirements/ingest/astradb.txt @@ -14,7 +14,7 @@ cassandra-driver==3.29.1 # via cassio cassio==0.1.8 # via astrapy -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore @@ -91,7 +91,7 @@ typing-extensions==4.12.2 # via # -c ./ingest/../base.txt # anyio -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/azure-cognitive-search.txt b/requirements/ingest/azure-cognitive-search.txt index a09121663d..00e75efb7c 100644 --- a/requirements/ingest/azure-cognitive-search.txt +++ b/requirements/ingest/azure-cognitive-search.txt @@ -10,7 +10,7 @@ azure-core==1.30.2 # via azure-search-documents azure-search-documents==11.5.1 # via -r ./ingest/azure-cognitive-search.in -certifi==2024.7.4 
+certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -38,7 +38,7 @@ typing-extensions==4.12.2 # -c ./ingest/../base.txt # azure-core # azure-search-documents -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt index 4be368a7e2..989b92879f 100644 --- a/requirements/ingest/azure.txt +++ b/requirements/ingest/azure.txt @@ -27,7 +27,7 @@ azure-identity==1.17.1 # via adlfs azure-storage-blob==12.22.0 # via adlfs -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -94,7 +94,7 @@ typing-extensions==4.12.2 # azure-core # azure-identity # azure-storage-blob -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/box.txt b/requirements/ingest/box.txt index 23c8d2b034..17cf46ce4b 100644 --- a/requirements/ingest/box.txt +++ b/requirements/ingest/box.txt @@ -10,7 +10,7 @@ boxfs==0.3.0 # via -r ./ingest/box.in boxsdk[jwt]==3.13.0 # via boxfs -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -51,7 +51,7 @@ six==1.16.0 # via # -c ./ingest/../base.txt # python-dateutil -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt index 3fe1760ec0..92d7a83c68 100644 --- a/requirements/ingest/chroma.txt +++ b/requirements/ingest/chroma.txt @@ -24,7 +24,7 @@ build==1.2.1 # via chromadb cachetools==5.5.0 # via google-auth -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore @@ -268,7 +268,7 @@ typing-extensions==4.12.2 # starlette # typer # uvicorn -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt index 
38a8672554..4f4adeda9d 100644 --- a/requirements/ingest/clarifai.txt +++ b/requirements/ingest/clarifai.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/clarifai.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -74,7 +74,7 @@ tqdm==4.66.5 # clarifai tritonclient==2.41.1 # via clarifai -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/confluence.txt b/requirements/ingest/confluence.txt index f5bf5073f4..c90b20225c 100644 --- a/requirements/ingest/confluence.txt +++ b/requirements/ingest/confluence.txt @@ -4,13 +4,13 @@ # # pip-compile ./ingest/confluence.in # -atlassian-python-api==3.41.14 +atlassian-python-api==3.41.15 # via -r ./ingest/confluence.in beautifulsoup4==4.12.3 # via # -c ./ingest/../base.txt # atlassian-python-api -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -45,7 +45,7 @@ soupsieve==2.6 # via # -c ./ingest/../base.txt # beautifulsoup4 -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/databricks-volumes.txt b/requirements/ingest/databricks-volumes.txt index 5aa989e9ab..c5f5c06eba 100644 --- a/requirements/ingest/databricks-volumes.txt +++ b/requirements/ingest/databricks-volumes.txt @@ -6,7 +6,7 @@ # cachetools==5.5.0 # via google-auth -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -34,7 +34,7 @@ requests==2.32.3 # databricks-sdk rsa==4.9 # via google-auth -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/dropbox.txt b/requirements/ingest/dropbox.txt index 0a4a8bc14b..df17ab45be 100644 --- a/requirements/ingest/dropbox.txt +++ b/requirements/ingest/dropbox.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/dropbox.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c 
./ingest/../base.txt # requests @@ -38,7 +38,7 @@ six==1.16.0 # stone stone==3.3.1 # via dropbox -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/elasticsearch.txt b/requirements/ingest/elasticsearch.txt index eada258046..012edb676b 100644 --- a/requirements/ingest/elasticsearch.txt +++ b/requirements/ingest/elasticsearch.txt @@ -14,7 +14,7 @@ async-timeout==4.0.3 # via aiohttp attrs==24.2.0 # via aiohttp -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # elastic-transport @@ -34,7 +34,7 @@ multidict==6.0.5 # via # aiohttp # yarl -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index fdc7facd66..c0263a6b75 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -31,7 +31,7 @@ botocore==1.34.131 # -c ./ingest/../deps/constraints.txt # boto3 # s3transfer -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore @@ -93,7 +93,7 @@ langchain-core==0.2.36 # langchain-text-splitters langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.106 +langsmith==0.1.107 # via # langchain # langchain-community @@ -177,7 +177,7 @@ typing-inspect==0.9.0 # via # -c ./ingest/../base.txt # dataclasses-json -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index f284cdf882..8bfcfbd208 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -10,7 +10,7 @@ anyio==4.4.0 # via # -c ./ingest/../base.txt # httpx -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore @@ -71,7 +71,7 @@ langchain-core==0.2.36 # via 
langchain-huggingface langchain-huggingface==0.0.3 # via -r ./ingest/embed-huggingface.in -langsmith==0.1.106 +langsmith==0.1.107 # via langchain-core markupsafe==2.1.5 # via jinja2 @@ -164,7 +164,7 @@ typing-extensions==4.12.2 # pydantic # pydantic-core # torch -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/embed-mixedbreadai.txt b/requirements/ingest/embed-mixedbreadai.txt index 19efbfbb90..a887513a15 100644 --- a/requirements/ingest/embed-mixedbreadai.txt +++ b/requirements/ingest/embed-mixedbreadai.txt @@ -10,7 +10,7 @@ anyio==4.4.0 # via # -c ./ingest/../base.txt # httpx -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index fb4a40c1dc..fd2c70e503 100644 --- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -11,7 +11,7 @@ anyio==4.4.0 # -c ./ingest/../base.txt # httpx # openai -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore @@ -47,7 +47,7 @@ idna==3.8 # requests jiter==0.5.0 # via openai -openai==1.42.0 +openai==1.43.0 # via -r ./ingest/embed-octoai.in pydantic==2.8.2 # via openai @@ -80,7 +80,7 @@ typing-extensions==4.12.2 # openai # pydantic # pydantic-core -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index fa0ad936de..fb7d4f47d6 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -11,7 +11,7 @@ anyio==4.4.0 # -c ./ingest/../base.txt # httpx # openai -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore @@ -56,9 +56,9 @@ langchain-core==0.2.36 # via langchain-openai langchain-openai==0.1.23 # via -r ./ingest/embed-openai.in -langsmith==0.1.106 +langsmith==0.1.107 
# via langchain-core -openai==1.42.0 +openai==1.43.0 # via langchain-openai orjson==3.10.7 # via langsmith @@ -106,7 +106,7 @@ typing-extensions==4.12.2 # openai # pydantic # pydantic-core -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt index ba23ab3998..8c6be7b5cc 100644 --- a/requirements/ingest/embed-vertexai.txt +++ b/requirements/ingest/embed-vertexai.txt @@ -26,7 +26,7 @@ attrs==24.2.0 # via aiohttp cachetools==5.5.0 # via google-auth -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore @@ -147,7 +147,7 @@ langchain-google-vertexai==1.0.10 # via -r ./ingest/embed-vertexai.in langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.106 +langsmith==0.1.107 # via # langchain # langchain-community @@ -261,7 +261,7 @@ typing-inspect==0.9.0 # via # -c ./ingest/../base.txt # dataclasses-json -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt index a95b286990..af8a43573e 100644 --- a/requirements/ingest/embed-voyageai.txt +++ b/requirements/ingest/embed-voyageai.txt @@ -26,7 +26,7 @@ async-timeout==4.0.3 # langchain attrs==24.2.0 # via aiohttp -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore @@ -78,7 +78,7 @@ langchain-text-splitters==0.2.2 # via langchain langchain-voyageai==0.1.1 # via -r ./ingest/embed-voyageai.in -langsmith==0.1.106 +langsmith==0.1.107 # via # langchain # langchain-core @@ -134,7 +134,7 @@ typing-extensions==4.12.2 # pydantic # pydantic-core # sqlalchemy -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/gcs.txt b/requirements/ingest/gcs.txt index 5f71483c28..07e0c880c4 100644 --- 
a/requirements/ingest/gcs.txt +++ b/requirements/ingest/gcs.txt @@ -22,7 +22,7 @@ bs4==0.0.2 # via -r ./ingest/gcs.in cachetools==5.5.0 # via google-auth -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -107,7 +107,7 @@ soupsieve==2.6 # via # -c ./ingest/../base.txt # beautifulsoup4 -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/github.txt b/requirements/ingest/github.txt index 593b877f0e..f7fe0b9b87 100644 --- a/requirements/ingest/github.txt +++ b/requirements/ingest/github.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/github.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -40,7 +40,7 @@ typing-extensions==4.12.2 # via # -c ./ingest/../base.txt # pygithub -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/gitlab.txt b/requirements/ingest/gitlab.txt index 22bba7dd18..11dac4c93b 100644 --- a/requirements/ingest/gitlab.txt +++ b/requirements/ingest/gitlab.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/gitlab.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -27,7 +27,7 @@ requests-toolbelt==1.0.0 # via # -c ./ingest/../base.txt # python-gitlab -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt index 495fa52d24..b2bc81469c 100644 --- a/requirements/ingest/google-drive.txt +++ b/requirements/ingest/google-drive.txt @@ -6,7 +6,7 @@ # cachetools==5.5.0 # via google-auth -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -59,7 +59,7 @@ rsa==4.9 # via google-auth uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c 
./ingest/../deps/constraints.txt diff --git a/requirements/ingest/hubspot.txt b/requirements/ingest/hubspot.txt index f8a2aa5bfb..da9b63b372 100644 --- a/requirements/ingest/hubspot.txt +++ b/requirements/ingest/hubspot.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/hubspot.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # hubspot-api-client @@ -19,7 +19,7 @@ six==1.16.0 # -c ./ingest/../base.txt # hubspot-api-client # python-dateutil -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/jira.txt b/requirements/ingest/jira.txt index cf24ccf063..36ba7fbb14 100644 --- a/requirements/ingest/jira.txt +++ b/requirements/ingest/jira.txt @@ -4,13 +4,13 @@ # # pip-compile ./ingest/jira.in # -atlassian-python-api==3.41.14 +atlassian-python-api==3.41.15 # via -r ./ingest/jira.in beautifulsoup4==4.12.3 # via # -c ./ingest/../base.txt # atlassian-python-api -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -45,7 +45,7 @@ soupsieve==2.6 # via # -c ./ingest/../base.txt # beautifulsoup4 -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/notion.txt b/requirements/ingest/notion.txt index 0fcdf0f10b..41d0ccb878 100644 --- a/requirements/ingest/notion.txt +++ b/requirements/ingest/notion.txt @@ -8,7 +8,7 @@ anyio==4.4.0 # via # -c ./ingest/../base.txt # httpx -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt index e2d4b79663..776363f699 100644 --- a/requirements/ingest/onedrive.txt +++ b/requirements/ingest/onedrive.txt @@ -10,7 +10,7 @@ beautifulsoup4==4.12.3 # bs4 bs4==0.0.2 # via -r ./ingest/onedrive.in -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -51,7 +51,7 @@ soupsieve==2.6 # via # -c 
./ingest/../base.txt # beautifulsoup4 -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/opensearch.txt b/requirements/ingest/opensearch.txt index 02c8d9028f..2fd83471e7 100644 --- a/requirements/ingest/opensearch.txt +++ b/requirements/ingest/opensearch.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/opensearch.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # opensearch-py @@ -33,7 +33,7 @@ six==1.16.0 # via # -c ./ingest/../base.txt # python-dateutil -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/outlook.txt b/requirements/ingest/outlook.txt index a93de4a611..a729926c64 100644 --- a/requirements/ingest/outlook.txt +++ b/requirements/ingest/outlook.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/outlook.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -41,7 +41,7 @@ requests==2.32.3 # -c ./ingest/../base.txt # msal # office365-rest-python-client -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/pinecone.txt b/requirements/ingest/pinecone.txt index 730436939a..c9727fab43 100644 --- a/requirements/ingest/pinecone.txt +++ b/requirements/ingest/pinecone.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/pinecone.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # pinecone-client @@ -24,7 +24,7 @@ typing-extensions==4.12.2 # via # -c ./ingest/../base.txt # pinecone-client -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt index a3b6fb3082..875103bd45 100644 --- a/requirements/ingest/qdrant.txt +++ b/requirements/ingest/qdrant.txt @@ -10,7 +10,7 @@ anyio==4.4.0 # via # -c ./ingest/../base.txt # 
httpx -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # httpcore @@ -76,7 +76,7 @@ typing-extensions==4.12.2 # anyio # pydantic # pydantic-core -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/reddit.txt b/requirements/ingest/reddit.txt index ff91de1afc..b7e19a9b7f 100644 --- a/requirements/ingest/reddit.txt +++ b/requirements/ingest/reddit.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/reddit.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -27,7 +27,7 @@ requests==2.32.3 # update-checker update-checker==0.18.0 # via praw -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/s3.txt b/requirements/ingest/s3.txt index c1221906a7..fbb13953b6 100644 --- a/requirements/ingest/s3.txt +++ b/requirements/ingest/s3.txt @@ -56,7 +56,7 @@ typing-extensions==4.12.2 # via # -c ./ingest/../base.txt # aioitertools -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/salesforce.txt b/requirements/ingest/salesforce.txt index c9e24f7938..8baaab2cd8 100644 --- a/requirements/ingest/salesforce.txt +++ b/requirements/ingest/salesforce.txt @@ -6,7 +6,7 @@ # attrs==24.2.0 # via zeep -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -63,7 +63,7 @@ typing-extensions==4.12.2 # via # -c ./ingest/../base.txt # simple-salesforce -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/sharepoint.txt b/requirements/ingest/sharepoint.txt index ac6f2d0e99..1514336011 100644 --- a/requirements/ingest/sharepoint.txt +++ b/requirements/ingest/sharepoint.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/sharepoint.in # -certifi==2024.7.4 +certifi==2024.8.30 # via # -c 
./ingest/../base.txt # requests @@ -41,7 +41,7 @@ requests==2.32.3 # -c ./ingest/../base.txt # msal # office365-rest-python-client -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/singlestore.txt b/requirements/ingest/singlestore.txt index 8f31a3bab1..529834f4ad 100644 --- a/requirements/ingest/singlestore.txt +++ b/requirements/ingest/singlestore.txt @@ -6,7 +6,7 @@ # build==1.2.1 # via singlestoredb -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -46,7 +46,7 @@ tomli==2.0.1 # via # build # singlestoredb -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt index b748273f15..451fd4850c 100644 --- a/requirements/ingest/weaviate.txt +++ b/requirements/ingest/weaviate.txt @@ -6,7 +6,7 @@ # authlib==1.3.2 # via weaviate-client -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -28,7 +28,7 @@ requests==2.32.3 # via # -c ./ingest/../base.txt # weaviate-client -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/ingest/wikipedia.txt b/requirements/ingest/wikipedia.txt index 0f6e15e8fa..ef53424d35 100644 --- a/requirements/ingest/wikipedia.txt +++ b/requirements/ingest/wikipedia.txt @@ -8,7 +8,7 @@ beautifulsoup4==4.12.3 # via # -c ./ingest/../base.txt # wikipedia -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./ingest/../base.txt # requests @@ -28,7 +28,7 @@ soupsieve==2.6 # via # -c ./ingest/../base.txt # beautifulsoup4 -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt diff --git a/requirements/test.txt b/requirements/test.txt index 983df7294c..9b562a1ded 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -18,7 +18,7 @@ 
autoflake==2.3.1 # via -r ./test.in black==24.8.0 # via -r ./test.in -certifi==2024.7.4 +certifi==2024.8.30 # via # -c ./base.txt # httpcore @@ -221,7 +221,7 @@ tzdata==2024.1 # via pandas ujson==5.10.0 # via label-studio-sdk -urllib3==1.26.19 +urllib3==1.26.20 # via # -c ././deps/constraints.txt # -c ./base.txt diff --git a/setup.py b/setup.py index b78e0c6f26..b0145704e4 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,6 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List epub_reqs = load_requirements("requirements/extra-epub.in") image_reqs = load_requirements("requirements/extra-pdf-image.in") markdown_reqs = load_requirements("requirements/extra-markdown.in") -msg_reqs = load_requirements("requirements/extra-msg.in") odt_reqs = load_requirements("requirements/extra-odt.in") org_reqs = load_requirements("requirements/extra-pandoc.in") pdf_reqs = load_requirements("requirements/extra-pdf-image.in") @@ -64,7 +63,6 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List + epub_reqs + image_reqs + markdown_reqs - + msg_reqs + odt_reqs + org_reqs + pdf_reqs @@ -117,7 +115,6 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List "epub": epub_reqs, "image": image_reqs, "md": markdown_reqs, - "msg": msg_reqs, "odt": odt_reqs, "org": org_reqs, "pdf": pdf_reqs, diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 8c363838c3..933882f9e2 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -386,6 +386,23 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_m assert file_type is expected_value +@pytest.mark.parametrize( + ("filename", "mime_type", "expected"), + [ + ("fake.doc", "application/vnd.ms-excel", FileType.DOC), + ("fake-power-point.ppt", "application/vnd.ms-excel", FileType.PPT), + ("tests-example.xls", "application/msword", 
FileType.XLS), + ("fake-email.msg", "application/vnd.ms-excel", FileType.MSG), + ], +) +def test_ole_file_structure_trusted_over_mime_type_guess(filename, mime_type, expected): + def _guess_mime(*args, **kwargs): + return mime_type + + with patch("filetype.guess_mime", _guess_mime): + detect_filetype(example_doc_path(filename)) == expected + + @pytest.mark.parametrize( ("expected_value", "file_name"), [ @@ -530,21 +547,6 @@ def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( # ================================================================================================ -@pytest.mark.parametrize( - ("metadata_file_path", "expected_value"), - [ - ("fake-email.msg", FileType.MSG), - ("fake-email.msg.outlook", FileType.UNK), - ], -) -def test_it_can_only_detect_MSG_format_by_extension( - metadata_file_path: str, expected_value: FileType -): - with open(example_doc_path("fake-email.msg"), "rb") as f: - file = io.BytesIO(f.read()) - assert detect_filetype(file=file, metadata_file_path=metadata_file_path) == expected_value - - @pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"]) @pytest.mark.parametrize("extension", [".html", ".htm"]) def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension( @@ -1028,7 +1030,7 @@ def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_t ("simple.doc", FileType.DOC), ("fake-power-point.ppt", FileType.PPT), ("tests-example.xls", FileType.XLS), - ("fake-email.msg", None), + ("fake-email.msg", FileType.MSG), ("README.org", None), ], ) @@ -1043,6 +1045,26 @@ def it_distinguishes_the_file_type_of_applicable_OLE_files( assert differentiator.file_type is expected_value + @pytest.mark.parametrize( + ("file_name", "expected_value"), + [ + ("simple.doc", FileType.DOC), + ("fake-power-point.ppt", FileType.PPT), + ("tests-example.xls", FileType.XLS), + ("fake-email.msg", FileType.MSG), + ], + ) + def 
it_distinguishes_the_file_type_of_applicable_OLE_files_from_storage_content( + self, file_name: str, expected_value: FileType | None + ): + # -- no file-name available, just to make sure we're not relying on an extension -- + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + ctx = _FileTypeDetectionContext(file=file) + differentiator = _OleFileDifferentiator(ctx) + + assert differentiator._check_ole_file_type(ctx) is expected_value + def but_it_returns_None_to_engage_fallback_when_filetype_cannot_guess_mime( self, guess_mime_: Mock ): @@ -1052,6 +1074,8 @@ def but_it_returns_None_to_engage_fallback_when_filetype_cannot_guess_mime( file = io.BytesIO(f.read()) ctx = _FileTypeDetectionContext(file=file) differentiator = _OleFileDifferentiator(ctx) + # -- force method to return None to trigger the mime type being guessed + differentiator._check_ole_file_type = lambda ctx: None file_type = differentiator.file_type diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5b85cacf11..eb904c1dae 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.9-dev1" # pragma: no cover +__version__ = "0.15.9" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 58f8bdbfa2..e459e72f84 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -38,6 +38,8 @@ from typing import IO, Callable, Iterator, Optional import filetype as ft +from olefile import OleFileIO +from oxmsg.storage import Storage from typing_extensions import ParamSpec from unstructured.documents.elements import Element @@ -476,6 +478,10 @@ def file_type(self) -> FileType | None: if not self._is_ole_file(self._ctx): return None + # -- check storage contents of the ole file for file type markers + if (ole_file_type := self._check_ole_file_type(self._ctx)) is not None: + return ole_file_type + # -- `filetype` lib is better 
at legacy MS-Office files than `libmagic`, so we rely on it # -- to differentiate those. Note `filetype` doesn't detect MSG type and won't always # -- detect DOC, PPT, or XLS, returning `None` instead. We let those fall through and we @@ -491,6 +497,24 @@ def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool: with ctx.open() as file: return file.read(8) == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" + @staticmethod + def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None: + with ctx.open() as f: + ole = OleFileIO(f) + root_storage = Storage.from_ole(ole) + + for stream in root_storage.streams: + if stream.name == "WordDocument": + return FileType.DOC + elif stream.name == "PowerPoint Document": + return FileType.PPT + elif stream.name == "Workbook": + return FileType.XLS + elif stream.name == "__properties_version1.0": + return FileType.MSG + + return None + class _TextFileDifferentiator: """Refine a textual file-type that may not be as specific as it could be."""