diff --git a/.github/actions/base-cache/action.yml b/.github/actions/base-cache/action.yml index 1c53deeb5e..05b1ddc227 100644 --- a/.github/actions/base-cache/action.yml +++ b/.github/actions/base-cache/action.yml @@ -40,6 +40,7 @@ runs: python -m pip install --upgrade setuptools fi make install-ci + make install-nltk-models - name: Save Cache if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true' id: virtualenv-cache-save diff --git a/.github/actions/base-ingest-cache/action.yml b/.github/actions/base-ingest-cache/action.yml index 27b8676be8..dc9d5105a2 100644 --- a/.github/actions/base-ingest-cache/action.yml +++ b/.github/actions/base-ingest-cache/action.yml @@ -39,6 +39,7 @@ runs: python -m pip install --upgrade setuptools fi make install-ci + make install-nltk-models make install-all-docs make install-ingest - name: Save Ingest Cache diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b4178406c9..3002dbb898 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -174,6 +174,7 @@ jobs: run: | source .venv/bin/activate make install-ci + make install-nltk-models make test-no-extras CI=true test_unit_dependency_extras: @@ -360,6 +361,7 @@ jobs: sudo apt-get update sudo apt-get install -y tesseract-ocr tesseract-ocr-kor tesseract --version + make install-nltk-models make test-unstructured-api-unit changelog: diff --git a/Makefile b/Makefile index d4bf50f6bd..5bda0044a1 100644 --- a/Makefile +++ b/Makefile @@ -15,17 +15,17 @@ help: Makefile ## install-base: installs core requirements needed for text processing bricks .PHONY: install-base -install-base: install-base-pip-packages +install-base: install-base-pip-packages install-nltk-models ## install: installs all test, dev, and experimental requirements .PHONY: install -install: install-base-pip-packages install-dev install-test install-huggingface install-all-docs +install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-all-docs .PHONY: install-ci -install-ci: install-base-pip-packages install-huggingface install-all-docs install-test install-pandoc +install-ci: install-base-pip-packages install-nltk-models install-huggingface install-all-docs install-test install-pandoc .PHONY: install-base-ci -install-base-ci: install-base-pip-packages install-test install-pandoc +install-base-ci: install-base-pip-packages install-nltk-models install-test install-pandoc .PHONY: install-base-pip-packages install-base-pip-packages: @@ -37,6 +37,10 @@ install-huggingface: ${PYTHON} -m pip install pip==${PIP_VERSION} ${PYTHON} -m pip install -r requirements/huggingface.txt +.PHONY: install-nltk-models +install-nltk-models: + ${PYTHON} -c "from unstructured.nlp.tokenize import copy_nltk_packages; copy_nltk_packages()" + .PHONY: install-test install-test: ${PYTHON} -m pip install -r requirements/test.txt diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index 116dd13996..44ca9cd4d3 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -1,5 +1,6 @@ from __future__ import annotations +import shutil import os from functools import lru_cache from typing import Final, List, Tuple @@ -15,6 +16,21 @@ NLTK_DATA_PATH = os.getenv("NLTK_DATA", "/home/notebook-user/nltk_data") nltk.data.path.append(NLTK_DATA_PATH) +def copy_nltk_packages(): + local_path = "../../nltk_data" + if os.path.exists(local_path): + if not os.path.exists(NLTK_DATA_PATH): + os.makedirs(NLTK_DATA_PATH) + for item in os.listdir(local_path): + s = os.path.join(local_path, item) + d = os.path.join(NLTK_DATA_PATH, item) + if os.path.isdir(s): + shutil.copytree(s, d, dirs_exist_ok=True) + else: + shutil.copy2(s, d) + print(f"NLTK data copied to {NLTK_DATA_PATH}") + else: + print(f"Local NLTK data path does not exist: {local_path}") def check_for_nltk_package(package_name: str, package_category: str) -> bool: """Checks to see if the specified NLTK package exists on the file system."""