Merge branch 'refs/heads/main' into refactor/remove-analysis-scripts

# Conflicts:
#	CHANGELOG.md
#	test_unstructured_inference/test_elements.py
#	unstructured_inference/__version__.py
#	unstructured_inference/utils.py
Unstructured-IO · Jun 25, 2024 · 6d0f23d · 6d0f23d
2 parents c3228be + 45b3be0
commit 6d0f23d
Show file tree

Hide file tree

Showing 34 changed files with 1,492 additions and 996 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -7,7 +7,7 @@ on:
     branches: [ main ]
 
 env:
-  PYTHON_VERSION: 3.8
+  PYTHON_VERSION: 3.9
 
 jobs:
   setup:
@@ -22,7 +22,7 @@ jobs:
         key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
         lookup-only: true
     - name: Set up Python ${{ env.PYTHON_VERSION }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - name: Install Poppler
@@ -104,48 +104,50 @@ jobs:
         CI=true make test
         make check-coverage
 
-  test_ingest:
-    strategy:
-      matrix:
-        python-version: ["3.8","3.9","3.10"]
-    runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
-    needs: lint
-    steps:
-    - name: Checkout unstructured repo for integration testing
-      uses: actions/checkout@v4
-      with:
-        repository: 'Unstructured-IO/unstructured'
-    - name: Checkout this repo
-      uses: actions/checkout@v4
-      with:
-        path: inference
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Test
-      env:
-        GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
-        SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
-        DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
-      run: |
-        python${{ matrix.python-version }} -m venv .venv
-        source .venv/bin/activate
-        [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
-        make install-ci
-        pip install -e inference/
-        sudo apt-get update
-        sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
-        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
-        sudo apt-get install -y tesseract-ocr
-        sudo apt-get install -y tesseract-ocr-kor
-        sudo apt-get install -y diffstat
-        tesseract --version
-        make install-all-ingest
-        # only run ingest tests that check expected output diffs.
-        bash inference/scripts/test-unstructured-ingest-helper.sh
+  # NOTE(robinson) - disabling ingest tests for now; as of 5/22/2024 they appear to have been
+  # broken for the past six months
+  # test_ingest:
+  #   strategy:
+  #     matrix:
+  #       python-version: ["3.9","3.10"]
+  #   runs-on: ubuntu-latest
+  #   env:
+  #     NLTK_DATA: ${{ github.workspace }}/nltk_data
+  #   needs: lint
+  #   steps:
+  #   - name: Checkout unstructured repo for integration testing
+  #     uses: actions/checkout@v4
+  #     with:
+  #       repository: 'Unstructured-IO/unstructured'
+  #   - name: Checkout this repo
+  #     uses: actions/checkout@v4
+  #     with:
+  #       path: inference
+  #   - name: Set up Python ${{ matrix.python-version }}
+  #     uses: actions/setup-python@v4
+  #     with:
+  #       python-version: ${{ matrix.python-version }}
+  #   - name: Test
+  #     env:
+  #       GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
+  #       SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
+  #       DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
+  #     run: |
+  #       python${{ matrix.python-version }} -m venv .venv
+  #       source .venv/bin/activate
+  #       [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
+  #       make install-ci
+  #       pip install -e inference/
+  #       sudo apt-get update
+  #       sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
+  #       sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
+  #       sudo apt-get install -y tesseract-ocr
+  #       sudo apt-get install -y tesseract-ocr-kor
+  #       sudo apt-get install -y diffstat
+  #       tesseract --version
+  #       make install-all-ingest
+  #       # only run ingest tests that check expected output diffs.
+  #       bash inference/scripts/test-unstructured-ingest-helper.sh
 
   changelog:
     runs-on: ubuntu-latest

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,85 @@
-## 0.7.20
+## 0.7.37-dev0
 
 * refactor: remove layout analysis related code
 
+## 0.7.36
+
+* fix: add input parameter validation to `fill_cells()` when converting cells to html
+
+## 0.7.35
+
+* Fix syntax for generated HTML tables
+
+## 0.7.34
+
+* Reduce excessive logging
+
+## 0.7.33
+
+* BREAKING CHANGE: removes legacy detectron2 model
+* deps: remove layoutparser optional dependencies
+
+## 0.7.32
+
+* refactor: remove all code related to filling inferred elements text from embedded text (pdfminer).
+* bug: set the Chipper max_length variable
+
+## 0.7.31
+
+* refactor: remove all `cid` related code that was originally added to filter out invalid `pdfminer` text
+* enhancement: Wrapped hf_hub_download with a function that checks for local file before checking HF
+
+## 0.7.30
+
+* fix: table transformer doesn't return multiple cells with same coordinates
+
+## 0.7.29
+
+* fix: table transformer predictions are now removed if confidence is below threshold
+
+
+## 0.7.28
+
+* feat: allow table transformer agent to return table prediction in not parsed format
+
+## 0.7.27
+
+* fix: remove pin from `onnxruntime` dependency.
+
+## 0.7.26
+
+* feat: add a set of new `ElementType`s to extend future element types recognition
+* feat: allow registering of new models for inference using `unstructured_inference.models.base.register_new_model` function
+
+## 0.7.25
+
+* fix: replace `Rectangle.is_in()` with `Rectangle.is_almost_subregion_of()` when filling in an inferred element with embedded text
+* bug: check for None in Chipper bounding box reduction
+* chore: removes `install-detectron2` from the `Makefile`
+* fix: convert label_map keys read from os.environment `UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH` to int type
+* feat: removes supergradients references
+
+## 0.7.24
+
+* fix: assign value to `text_as_html` element attribute only if `text` attribute contains HTML tags.
+
+## 0.7.23
+
+* fix: added handling in `UnstructuredTableTransformerModel` for when `recognize` returns an empty
+  list in `run_prediction`.
+
+## 0.7.22
+
+* fix: add logic to handle computation of intersections between 2 `Rectangle`s when a `Rectangle` has `None` value in its coordinates
+
+## 0.7.21
+
+* fix: fix a bug where chipper, or any element extraction model based `PageLayout` object, lacks `image_metadata` and other attributes that are required for downstream processing; this fix also reduces the memory overhead of using chipper model
+
+## 0.7.20
+
+* chipper-v3: improved table prediction
+
 ## 0.7.19
 
 * refactor: remove all OCR related code

diff --git a/Makefile b/Makefile
@@ -19,7 +19,7 @@ install-base: install-base-pip-packages
 
 ## install:                 installs all test, dev, and experimental requirements
 .PHONY: install
-install: install-base-pip-packages install-dev install-detectron2
+install: install-base-pip-packages install-dev
 
 .PHONY: install-ci
 install-ci: install-base-pip-packages install-test
@@ -28,10 +28,6 @@ install-ci: install-base-pip-packages install-test
 install-base-pip-packages:
 	python3 -m pip install pip==${PIP_VERSION}
 
-.PHONY: install-detectron2
-install-detectron2:
-	pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@57bdb21249d5418c130d54e2ebdc94dda7a4c01a"
-
 .PHONY: install-test
 install-test: install-base
 	pip install -r requirements/test.txt
@@ -44,10 +40,6 @@ install-dev: install-test
 .PHONY: pip-compile
 pip-compile:
 	pip-compile --upgrade requirements/base.in
-	# NOTE(robinson) - We want the dependencies for detectron2 in the requirements.txt, but not
-	# the detectron2 repo itself. If detectron2 is in the requirements.txt file, an order of
-	# operations issue related to the torch library causes the install to fail
-	sed 's/^detectron2 @/# detectron2 @/g' requirements/base.txt
 	pip-compile --upgrade requirements/test.in
 	pip-compile --upgrade requirements/dev.in
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,2 @@
+[tool.black]
+line-length = 100
diff --git a/requirements/base.in b/requirements/base.in
@@ -1,11 +1,13 @@
 -c constraints.in
-layoutparser[layoutmodels,tesseract]
+layoutparser
 python-multipart
 huggingface-hub
 opencv-python!=4.7.0.68
 onnx
-# NOTE(benjamin): Pinned because onnxruntime changed the way quantization is done, and we need to update our code to support it
-onnxruntime<1.16
+onnxruntime>=1.17.0
+matplotlib
+torch
+timm
 # NOTE(alan): Pinned because this is when the most recent module we import appeared
 transformers>=4.25.1
 rapidfuzz