Skip to content

Commit

Permalink
Merge branch 'refs/heads/main' into refactor/remove-analysis-scripts
Browse files Browse the repository at this point in the history
# Conflicts:
#	CHANGELOG.md
#	test_unstructured_inference/test_elements.py
#	unstructured_inference/__version__.py
#	unstructured_inference/utils.py
  • Loading branch information
christinestraub committed Jun 25, 2024
2 parents c3228be + 45b3be0 commit 6d0f23d
Show file tree
Hide file tree
Showing 34 changed files with 1,492 additions and 996 deletions.
90 changes: 46 additions & 44 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
branches: [ main ]

env:
PYTHON_VERSION: 3.8
PYTHON_VERSION: 3.9

jobs:
setup:
Expand All @@ -22,7 +22,7 @@ jobs:
key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
lookup-only: true
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Poppler
Expand Down Expand Up @@ -104,48 +104,50 @@ jobs:
CI=true make test
make check-coverage
test_ingest:
strategy:
matrix:
python-version: ["3.8","3.9","3.10"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: lint
steps:
- name: Checkout unstructured repo for integration testing
uses: actions/checkout@v4
with:
repository: 'Unstructured-IO/unstructured'
- name: Checkout this repo
uses: actions/checkout@v4
with:
path: inference
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Test
env:
GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
run: |
python${{ matrix.python-version }} -m venv .venv
source .venv/bin/activate
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
make install-ci
pip install -e inference/
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install -y diffstat
tesseract --version
make install-all-ingest
# only run ingest tests that check expected output diffs.
bash inference/scripts/test-unstructured-ingest-helper.sh
# NOTE(robinson) - disabling ingest tests for now, as of 5/22/2024 they seem to have been
# broken for the past six months
# test_ingest:
# strategy:
# matrix:
# python-version: ["3.9","3.10"]
# runs-on: ubuntu-latest
# env:
# NLTK_DATA: ${{ github.workspace }}/nltk_data
# needs: lint
# steps:
# - name: Checkout unstructured repo for integration testing
# uses: actions/checkout@v4
# with:
# repository: 'Unstructured-IO/unstructured'
# - name: Checkout this repo
# uses: actions/checkout@v4
# with:
# path: inference
# - name: Set up Python ${{ matrix.python-version }}
# uses: actions/setup-python@v4
# with:
# python-version: ${{ matrix.python-version }}
# - name: Test
# env:
# GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
# SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
# DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
# run: |
# python${{ matrix.python-version }} -m venv .venv
# source .venv/bin/activate
# [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
# make install-ci
# pip install -e inference/
# sudo apt-get update
# sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
# sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
# sudo apt-get install -y tesseract-ocr
# sudo apt-get install -y tesseract-ocr-kor
# sudo apt-get install -y diffstat
# tesseract --version
# make install-all-ingest
# # only run ingest tests that check expected output diffs.
# bash inference/scripts/test-unstructured-ingest-helper.sh

changelog:
runs-on: ubuntu-latest
Expand Down
80 changes: 79 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,85 @@
## 0.7.20
## 0.7.37-dev0

* refactor: remove layout analysis related code

## 0.7.36

* fix: add input parameter validation to `fill_cells()` when converting cells to HTML

## 0.7.35

* fix: syntax for generated HTML tables

## 0.7.34

* Reduce excessive logging

## 0.7.33

* BREAKING CHANGE: removes legacy detectron2 model
* deps: remove layoutparser optional dependencies

## 0.7.32

* refactor: remove all code related to filling inferred elements text from embedded text (pdfminer).
* bug: set the Chipper max_length variable

## 0.7.31

* refactor: remove all `cid` related code that was originally added to filter out invalid `pdfminer` text
* enhancement: Wrapped hf_hub_download with a function that checks for local file before checking HF

## 0.7.30

* fix: table transformer no longer returns multiple cells with the same coordinates

## 0.7.29

* fix: table transformer predictions are now removed if confidence is below threshold


## 0.7.28

* feat: allow the table transformer agent to return table predictions in unparsed format

## 0.7.27

* fix: remove pin from `onnxruntime` dependency.

## 0.7.26

* feat: add a set of new `ElementType`s to extend future element types recognition
* feat: allow registering of new models for inference using `unstructured_inference.models.base.register_new_model` function

## 0.7.25

* fix: replace `Rectangle.is_in()` with `Rectangle.is_almost_subregion_of()` when filling in an inferred element with embedded text
* bug: check for None in Chipper bounding box reduction
* chore: removes `install-detectron2` from the `Makefile`
* fix: convert label_map keys read from os.environment `UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH` to int type
* feat: removes supergradients references

## 0.7.24

* fix: assign value to `text_as_html` element attribute only if `text` attribute contains HTML tags.

## 0.7.23

* fix: added handling in `UnstructuredTableTransformerModel` for the case where `recognize` returns an empty list in `run_prediction`.

## 0.7.22

* fix: add logic to handle computation of intersections between 2 `Rectangle`s when a `Rectangle` has `None` value in its coordinates

## 0.7.21

* fix: fix a bug where chipper, or any element extraction model based `PageLayout` object, lacks `image_metadata` and other attributes that are required for downstream processing; this fix also reduces the memory overhead of using the chipper model

## 0.7.20

* chipper-v3: improved table prediction

## 0.7.19

* refactor: remove all OCR related code
Expand Down
10 changes: 1 addition & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ install-base: install-base-pip-packages

## install: installs all test, dev, and experimental requirements
.PHONY: install
install: install-base-pip-packages install-dev install-detectron2
install: install-base-pip-packages install-dev

.PHONY: install-ci
install-ci: install-base-pip-packages install-test
Expand All @@ -28,10 +28,6 @@ install-ci: install-base-pip-packages install-test
install-base-pip-packages:
python3 -m pip install pip==${PIP_VERSION}

.PHONY: install-detectron2
install-detectron2:
pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@57bdb21249d5418c130d54e2ebdc94dda7a4c01a"

.PHONY: install-test
install-test: install-base
pip install -r requirements/test.txt
Expand All @@ -44,10 +40,6 @@ install-dev: install-test
.PHONY: pip-compile
pip-compile:
pip-compile --upgrade requirements/base.in
# NOTE(robinson) - We want the dependencies for detectron2 in the requirements.txt, but not
# the detectron2 repo itself. If detectron2 is in the requirements.txt file, an order of
# operations issue related to the torch library causes the install to fail
sed 's/^detectron2 @/# detectron2 @/g' requirements/base.txt
pip-compile --upgrade requirements/test.in
pip-compile --upgrade requirements/dev.in

Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[tool.black]
line-length = 100
8 changes: 5 additions & 3 deletions requirements/base.in
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
-c constraints.in
layoutparser[layoutmodels,tesseract]
layoutparser
python-multipart
huggingface-hub
opencv-python!=4.7.0.68
onnx
# NOTE(benjamin): Pinned because onnxruntime changed the way quantization is done, and we need to update our code to support it
onnxruntime<1.16
onnxruntime>=1.17.0
matplotlib
torch
timm
# NOTE(alan): Pinned because this is when the most recent module we import appeared
transformers>=4.25.1
rapidfuzz
Loading

0 comments on commit 6d0f23d

Please sign in to comment.