Skip to content

Commit

Permalink
feat:fix ingest test errors
Browse files Browse the repository at this point in the history
  • Loading branch information
christinestraub committed Jan 3, 2025
1 parent 03a0adf commit 9c42660
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 4 deletions.
1 change: 1 addition & 0 deletions .github/actions/base-cache/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ runs:
python -m pip install --upgrade setuptools
fi
make install-ci
make install-nltk-models
- name: Save Cache
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
id: virtualenv-cache-save
Expand Down
1 change: 1 addition & 0 deletions .github/actions/base-ingest-cache/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ runs:
python -m pip install --upgrade setuptools
fi
make install-ci
make install-nltk-models
make install-all-docs
make install-ingest
- name: Save Ingest Cache
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ jobs:
run: |
source .venv/bin/activate
make install-ci
make install-nltk-models
make test-no-extras CI=true
test_unit_dependency_extras:
Expand Down Expand Up @@ -360,6 +361,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make install-nltk-models
make test-unstructured-api-unit
changelog:
Expand Down
12 changes: 8 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,17 @@ help: Makefile

## install-base: installs core requirements needed for text processing bricks
.PHONY: install-base
install-base: install-base-pip-packages
install-base: install-base-pip-packages install-nltk-models

## install: installs all test, dev, and experimental requirements
.PHONY: install
install: install-base-pip-packages install-dev install-test install-huggingface install-all-docs
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-all-docs

.PHONY: install-ci
install-ci: install-base-pip-packages install-huggingface install-all-docs install-test install-pandoc
install-ci: install-base-pip-packages install-nltk-models install-huggingface install-all-docs install-test install-pandoc

.PHONY: install-base-ci
install-base-ci: install-base-pip-packages install-test install-pandoc
install-base-ci: install-base-pip-packages install-nltk-models install-test install-pandoc

.PHONY: install-base-pip-packages
install-base-pip-packages:
Expand All @@ -37,6 +37,10 @@ install-huggingface:
${PYTHON} -m pip install pip==${PIP_VERSION}
${PYTHON} -m pip install -r requirements/huggingface.txt

## install-nltk-models: copies locally bundled NLTK data into the NLTK data directory via copy_nltk_packages()
.PHONY: install-nltk-models
install-nltk-models:
${PYTHON} -c "from unstructured.nlp.tokenize import copy_nltk_packages; copy_nltk_packages()"

.PHONY: install-test
install-test:
${PYTHON} -m pip install -r requirements/test.txt
Expand Down
16 changes: 16 additions & 0 deletions unstructured/nlp/tokenize.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import shutil
import os
from functools import lru_cache
from typing import Final, List, Tuple
Expand All @@ -15,6 +16,21 @@
NLTK_DATA_PATH = os.getenv("NLTK_DATA", "/home/notebook-user/nltk_data")
nltk.data.path.append(NLTK_DATA_PATH)

def copy_nltk_packages():
    """Copy the locally bundled NLTK data directory into ``NLTK_DATA_PATH``.

    The bundled data is expected in an ``nltk_data`` directory two levels
    above this module's directory. Every top-level entry found there is
    copied into ``NLTK_DATA_PATH`` (created if missing); existing
    directories are merged and existing files are overwritten.

    If no bundled data directory can be found, a message is printed and
    nothing is copied. No exception is raised for a missing source.
    """
    # Resolve the bundled data relative to this module, not the current
    # working directory: the function is invoked from arbitrary directories
    # (e.g. ``make`` runs it from the repository root), where a bare
    # "../../nltk_data" would point outside the checkout.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    anchored_path = os.path.normpath(
        os.path.join(module_dir, "..", "..", "nltk_data")
    )

    # Keep the original cwd-relative location as a fallback so callers that
    # relied on it keep working.
    cwd_relative_path = "../../nltk_data"
    if os.path.isdir(anchored_path):
        local_path = anchored_path
    elif os.path.isdir(cwd_relative_path):
        local_path = cwd_relative_path
    else:
        print(f"Local NLTK data path does not exist: {anchored_path}")
        return

    os.makedirs(NLTK_DATA_PATH, exist_ok=True)
    for item in os.listdir(local_path):
        s = os.path.join(local_path, item)
        d = os.path.join(NLTK_DATA_PATH, item)
        if os.path.isdir(s):
            # dirs_exist_ok lets repeated installs merge into an existing tree.
            shutil.copytree(s, d, dirs_exist_ok=True)
        else:
            shutil.copy2(s, d)
    print(f"NLTK data copied to {NLTK_DATA_PATH}")

def check_for_nltk_package(package_name: str, package_category: str) -> bool:
"""Checks to see if the specified NLTK package exists on the file system."""
Expand Down

0 comments on commit 9c42660

Please sign in to comment.