From c82bad1061fcea527f8f094975c079db692dd930 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Wed, 24 May 2023 17:29:35 -0500 Subject: [PATCH] build(deps): avoid version conflicts (#636) Addresses #631. * Uses constraints to keep dependency versions more consistent. * Moves all dependencies to .in files which are then ingested by setup.py. * Adds script to check consistency of all extras. * Adds consistency check to CI. I should note that while it shouldn't be possible to cause a conflict between base.txt and any of the extras (because base.txt constrains all the extras) it is possible to get a conflict between two of the extras files. There are ways of trying to avoid that (like constraining each file by all the files that have already been processed before it in the order given in the make pip-compile target) but the ones I could think of seemed a little overwrought, and come with problems of their own. If a conflict arises, it should be flagged by CI or locally with make check-deps. When/if that happens, you can resolve the conflict by adding appropriate global constraints in requirements/constraints.in. Also note that if fileA.in is constrained by fileB.txt, then fileB.in should be compiled before fileA.in in the make pip-compile target. Otherwise fileA.in will be compiled with the old version of fileB.txt which can cause conflicts or keep dependencies from being updated properly. 
--- .github/workflows/ci.yml | 30 ++++ CHANGELOG.md | 3 +- MANIFEST.in | 12 ++ Makefile | 31 ++-- docs/requirements.txt | 6 +- requirements/base.in | 16 ++ requirements/base.txt | 57 ++++--- requirements/build.txt | 6 +- requirements/cache.txt | 2 +- requirements/constraints.in | 15 ++ requirements/dev.in | 6 +- requirements/dev.txt | 63 +++++--- requirements/huggingface.in | 7 + requirements/huggingface.txt | 148 ++++------------- requirements/ingest-azure.in | 4 + requirements/ingest-azure.txt | 228 ++++----------------------- requirements/ingest-discord.in | 3 + requirements/ingest-discord.txt | 206 +----------------------- requirements/ingest-github.in | 5 + requirements/ingest-github.txt | 207 +++--------------------- requirements/ingest-gitlab.in | 3 + requirements/ingest-gitlab.txt | 204 ++---------------------- requirements/ingest-google-drive.in | 3 + requirements/ingest-google-drive.txt | 206 ++---------------------- requirements/ingest-reddit.in | 3 + requirements/ingest-reddit.txt | 204 ++---------------------- requirements/ingest-s3.in | 4 + requirements/ingest-s3.txt | 206 ++---------------------- requirements/ingest-slack.in | 3 + requirements/ingest-slack.txt | 205 +----------------------- requirements/ingest-wikipedia.in | 3 + requirements/ingest-wikipedia.txt | 202 ++---------------------- requirements/local-inference.in | 3 + requirements/local-inference.txt | 169 ++++++++------------ requirements/test.in | 6 +- requirements/test.txt | 62 +++++--- scripts/consistent-deps.sh | 45 ++++++ setup.py | 77 ++++----- unstructured/__version__.py | 2 +- 39 files changed, 557 insertions(+), 2108 deletions(-) create mode 100644 MANIFEST.in create mode 100644 requirements/base.in create mode 100644 requirements/constraints.in create mode 100644 requirements/huggingface.in create mode 100644 requirements/ingest-azure.in create mode 100644 requirements/ingest-discord.in create mode 100644 requirements/ingest-github.in create mode 100644 
requirements/ingest-gitlab.in create mode 100644 requirements/ingest-google-drive.in create mode 100644 requirements/ingest-reddit.in create mode 100644 requirements/ingest-s3.in create mode 100644 requirements/ingest-slack.in create mode 100644 requirements/ingest-wikipedia.in create mode 100644 requirements/local-inference.in create mode 100755 scripts/consistent-deps.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 609ded7d8e..fa7b2932f3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,6 +38,36 @@ jobs: source .venv/bin/activate make install-ci + check-deps: + strategy: + matrix: + python-version: ["3.8","3.9","3.10"] + runs-on: ubuntu-latest + needs: setup + steps: + - uses: actions/checkout@v3 + - uses: actions/cache@v3 + id: virtualenv-cache + with: + path: .venv + key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} + # NOTE(robinson) - This is a fallback in case the lint job does not find the cache. 
+ # We can take this out when we implement the fix in CORE-99 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Setup virtual environment (no cache hit) + if: steps.virtualenv-cache.outputs.cache-hit != 'true' + run: | + python${{ matrix.python-version }} -m venv .venv + source .venv/bin/activate + make install-base-pip-packages + - name: Check for dependency conflicts + run: | + source .venv/bin/activate + make check-deps + lint: strategy: matrix: diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fa7cf54b7..0ef7fd2152 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ -## 0.6.9-dev2 +## 0.6.9 ### Enhancements * fast strategy for pdf now keeps element bounding box data +* setup.py refactor ### Features diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000..9e1b6a71f3 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,12 @@ +include requirements/base.in +include requirements/huggingface.in +include requirements/local-inference.in +include requirements/ingest-s3.in +include requirements/ingest-azure.in +include requirements/ingest-discord.in +include requirements/ingest-github.in +include requirements/ingest-gitlab.in +include requirements/ingest-reddit.in +include requirements/ingest-slack.in +include requirements/ingest-wikipedia.in +include requirements/ingest-google-drive.in diff --git a/Makefile b/Makefile index 3c8c4c81e1..f057129bee 100644 --- a/Makefile +++ b/Makefile @@ -108,28 +108,28 @@ install-local-inference: install install-unstructured-inference install-detectro ## pip-compile: compiles all base/dev/test requirements .PHONY: pip-compile pip-compile: - pip-compile --upgrade -o requirements/base.txt + pip-compile --upgrade requirements/base.in # Extra requirements for huggingface staging functions - pip-compile --upgrade --extra huggingface -o requirements/huggingface.txt + pip-compile --upgrade requirements/huggingface.in # 
NOTE(robinson) - We want the dependencies for detectron2 in the requirements.txt, but not # the detectron2 repo itself. If detectron2 is in the requirements.txt file, an order of # operations issue related to the torch library causes the install to fail - pip-compile --upgrade requirements/dev.in pip-compile --upgrade requirements/test.in + pip-compile --upgrade requirements/dev.in pip-compile --upgrade requirements/build.in - pip-compile --upgrade --extra local-inference -o requirements/local-inference.txt + pip-compile --upgrade requirements/local-inference.in # NOTE(robinson) - doc/requirements.txt is where the GitHub action for building # sphinx docs looks for additional requirements cp requirements/build.txt docs/requirements.txt - pip-compile --upgrade --extra=s3 --output-file=requirements/ingest-s3.txt requirements/base.txt setup.py - pip-compile --upgrade --extra=azure --output-file=requirements/ingest-azure.txt requirements/base.txt setup.py - pip-compile --upgrade --extra=discord --output-file=requirements/ingest-azure.txt requirements/base.txt setup.py - pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py - pip-compile --upgrade --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py - pip-compile --upgrade --extra=gitlab --output-file=requirements/ingest-gitlab.txt requirements/base.txt setup.py - pip-compile --upgrade --extra=slack --output-file=requirements/ingest-slack.txt requirements/base.txt setup.py - pip-compile --upgrade --extra=wikipedia --output-file=requirements/ingest-wikipedia.txt requirements/base.txt setup.py - pip-compile --upgrade --extra=google-drive --output-file=requirements/ingest-google-drive.txt requirements/base.txt setup.py + pip-compile --upgrade requirements/ingest-s3.in + pip-compile --upgrade requirements/ingest-azure.in + pip-compile --upgrade requirements/ingest-discord.in + pip-compile --upgrade requirements/ingest-reddit.in + 
pip-compile --upgrade requirements/ingest-github.in + pip-compile --upgrade requirements/ingest-gitlab.in + pip-compile --upgrade requirements/ingest-slack.in + pip-compile --upgrade requirements/ingest-wikipedia.in + pip-compile --upgrade requirements/ingest-google-drive.in ## install-project-local: install unstructured into your local python environment .PHONY: install-project-local @@ -198,6 +198,11 @@ version-sync: check-coverage: coverage report --fail-under=95 +## check-deps: check consistency of dependencies +.PHONY: check-deps +check-deps: + scripts/consistent-deps.sh + ########## # Docker # ########## diff --git a/docs/requirements.txt b/docs/requirements.txt index f9aad7b7bb..e9c1f77138 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -10,7 +10,7 @@ babel==2.12.1 # via sphinx beautifulsoup4==4.12.2 # via furo -certifi==2022.12.7 +certifi==2023.5.7 # via # -r requirements/build.in # requests @@ -20,7 +20,7 @@ docutils==0.18.1 # via # sphinx # sphinx-rtd-theme -furo==2023.3.27 +furo==2023.5.20 # via -r requirements/build.in idna==3.4 # via requests @@ -40,7 +40,7 @@ pygments==2.15.1 # sphinx pytz==2023.3 # via babel -requests==2.30.0 +requests==2.31.0 # via sphinx snowballstemmer==2.2.0 # via sphinx diff --git a/requirements/base.in b/requirements/base.in new file mode 100644 index 0000000000..57a55833f4 --- /dev/null +++ b/requirements/base.in @@ -0,0 +1,16 @@ +-c "constraints.in" +argilla +chardet +lxml +msg_parser +nltk +openpyxl +pandas +pdfminer.six +pillow +pypandoc +python-docx +python-pptx +python-magic +markdown +requests diff --git a/requirements/base.txt b/requirements/base.txt index 6c22fa2bee..5303a6f65d 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -2,20 +2,20 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --output-file=requirements/base.txt +# pip-compile requirements/base.in # anyio==3.6.2 # via httpcore -argilla==1.6.0 - # via unstructured 
(setup.py) +argilla==1.7.0 + # via -r requirements/base.in backoff==2.2.1 # via argilla -certifi==2022.12.7 +certifi==2023.5.7 # via + # -c requirements/constraints.in # httpcore # httpx # requests - # unstructured (setup.py) cffi==1.15.1 # via cryptography chardet==5.1.0 @@ -25,7 +25,9 @@ charset-normalizer==3.1.0 # pdfminer-six # requests click==8.1.3 - # via nltk + # via + # nltk + # typer commonmark==0.9.1 # via rich cryptography==40.0.2 @@ -51,17 +53,17 @@ joblib==1.2.0 # via nltk lxml==4.9.2 # via + # -r requirements/base.in # python-docx # python-pptx - # unstructured (setup.py) markdown==3.4.3 - # via unstructured (setup.py) + # via -r requirements/base.in monotonic==1.6 # via argilla msg-parser==1.2.0 - # via unstructured (setup.py) + # via -r requirements/base.in nltk==3.8.1 - # via unstructured (setup.py) + # via -r requirements/base.in numpy==1.23.5 # via # argilla @@ -69,41 +71,41 @@ numpy==1.23.5 olefile==0.46 # via msg-parser openpyxl==3.1.2 - # via unstructured (setup.py) + # via -r requirements/base.in packaging==23.1 # via argilla pandas==1.5.3 # via + # -r requirements/base.in # argilla - # unstructured (setup.py) pdfminer-six==20221105 - # via unstructured (setup.py) + # via -r requirements/base.in pillow==9.5.0 # via + # -r requirements/base.in # python-pptx - # unstructured (setup.py) pycparser==2.21 # via cffi -pydantic==1.10.7 +pydantic==1.10.8 # via argilla pygments==2.15.1 # via rich pypandoc==1.11 - # via unstructured (setup.py) + # via -r requirements/base.in python-dateutil==2.8.2 # via pandas python-docx==0.8.11 - # via unstructured (setup.py) + # via -r requirements/base.in python-magic==0.4.27 - # via unstructured (setup.py) + # via -r requirements/base.in python-pptx==0.6.21 - # via unstructured (setup.py) + # via -r requirements/base.in pytz==2023.3 # via pandas regex==2023.5.5 # via nltk -requests==2.30.0 - # via unstructured (setup.py) +requests==2.31.0 + # via -r requirements/base.in rfc3986[idna2008]==1.5.0 # via httpx 
rich==13.0.1 @@ -119,17 +121,22 @@ tqdm==4.65.0 # via # argilla # nltk -typing-extensions==4.5.0 +typer==0.9.0 + # via argilla +typing-extensions==4.6.0 # via # pydantic # rich -urllib3==2.0.2 - # via requests + # typer +urllib3==1.26.16 + # via + # -c requirements/constraints.in + # requests wrapt==1.14.1 # via # argilla # deprecated -xlsxwriter==3.1.0 +xlsxwriter==3.1.1 # via python-pptx zipp==3.15.0 # via importlib-metadata diff --git a/requirements/build.txt b/requirements/build.txt index f9aad7b7bb..e9c1f77138 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -10,7 +10,7 @@ babel==2.12.1 # via sphinx beautifulsoup4==4.12.2 # via furo -certifi==2022.12.7 +certifi==2023.5.7 # via # -r requirements/build.in # requests @@ -20,7 +20,7 @@ docutils==0.18.1 # via # sphinx # sphinx-rtd-theme -furo==2023.3.27 +furo==2023.5.20 # via -r requirements/build.in idna==3.4 # via requests @@ -40,7 +40,7 @@ pygments==2.15.1 # sphinx pytz==2023.3 # via babel -requests==2.30.0 +requests==2.31.0 # via sphinx snowballstemmer==2.2.0 # via sphinx diff --git a/requirements/cache.txt b/requirements/cache.txt index 2e65efe2a1..d229daaecc 100644 --- a/requirements/cache.txt +++ b/requirements/cache.txt @@ -1 +1 @@ -a \ No newline at end of file +# a \ No newline at end of file diff --git a/requirements/constraints.in b/requirements/constraints.in new file mode 100644 index 0000000000..52c939b3ee --- /dev/null +++ b/requirements/constraints.in @@ -0,0 +1,15 @@ +#################################################################################################### +# This file can house global constraints that aren't *direct* requirements of the package or any +# extras. Putting a dependency here will only affect dependency sets that contain them -- in other +# words, if something does not require a constraint, it will not be installed. 
+#################################################################################################### +# NOTE(alan): Pinning to avoid conflicts with downstream ingest-s3 +urllib3<1.27, >=1.25.4 +# consistency with local-inference-pin +protobuf<3.21 +# NOTE(robinson) - Required pins for security scans +jupyter-core>=4.11.2 +wheel>=0.38.1 +# NOTE(robinson) - The following pins are to address +# vulnerabilities in dependency scans +certifi>=2022.12.07 diff --git a/requirements/dev.in b/requirements/dev.in index f2a7ebf0fb..342ef0ae40 100644 --- a/requirements/dev.in +++ b/requirements/dev.in @@ -1,7 +1,7 @@ +-c constraints.in +-c base.txt +-c test.txt jupyter ipython pip-tools pre-commit -# NOTE(robinson) - Required pins for security scans -jupyter-core>=4.11.2 -wheel>=0.38.1 diff --git a/requirements/dev.txt b/requirements/dev.txt index 9ee3a424c0..a9d5cd2b23 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -5,7 +5,9 @@ # pip-compile requirements/dev.in # anyio==3.6.2 - # via jupyter-server + # via + # -c requirements/base.txt + # jupyter-server appnope==0.1.3 # via # ipykernel @@ -32,13 +34,18 @@ bleach==6.0.0 build==0.10.0 # via pip-tools cffi==1.15.1 - # via argon2-cffi-bindings + # via + # -c requirements/base.txt + # argon2-cffi-bindings cfgv==3.3.1 # via pre-commit chardet==5.1.0 # via -r requirements/dev.in click==8.1.3 - # via pip-tools + # via + # -c requirements/base.txt + # -c requirements/test.txt + # pip-tools comm==0.1.3 # via ipykernel debugpy==1.6.7 @@ -51,7 +58,7 @@ distlib==0.3.6 # via virtualenv executing==1.2.0 # via stack-data -fastjsonschema==2.16.3 +fastjsonschema==2.17.1 # via nbformat filelock==3.12.0 # via virtualenv @@ -61,15 +68,18 @@ identify==2.5.24 # via pre-commit idna==3.4 # via + # -c requirements/base.txt + # -c requirements/test.txt # anyio # jsonschema importlib-metadata==6.6.0 # via + # -c requirements/base.txt # jupyter-client # nbconvert importlib-resources==5.12.0 # via jsonschema -ipykernel==6.22.0 
+ipykernel==6.23.1 # via # ipywidgets # jupyter @@ -121,7 +131,7 @@ jupyter-console==6.6.3 # via jupyter jupyter-core==5.3.0 # via - # -r requirements/dev.in + # -c requirements/constraints.in # ipykernel # jupyter-client # jupyter-console @@ -156,9 +166,9 @@ mistune==2.0.5 # via nbconvert nbclassic==1.0.0 # via notebook -nbclient==0.7.4 +nbclient==0.8.0 # via nbconvert -nbconvert==7.3.1 +nbconvert==7.4.0 # via # jupyter # jupyter-server @@ -176,7 +186,7 @@ nest-asyncio==1.5.6 # ipykernel # nbclassic # notebook -nodeenv==1.7.0 +nodeenv==1.8.0 # via pre-commit notebook==6.5.4 # via jupyter @@ -184,6 +194,8 @@ notebook-shim==0.2.3 # via nbclassic packaging==23.1 # via + # -c requirements/base.txt + # -c requirements/test.txt # build # ipykernel # jupyter-server @@ -202,11 +214,12 @@ pip-tools==6.13.0 # via -r requirements/dev.in pkgutil-resolve-name==1.3.10 # via jsonschema -platformdirs==3.5.0 +platformdirs==3.5.1 # via + # -c requirements/test.txt # jupyter-core # virtualenv -pre-commit==3.3.1 +pre-commit==3.3.2 # via -r requirements/dev.in prometheus-client==0.16.0 # via @@ -226,9 +239,12 @@ ptyprocess==0.7.0 pure-eval==0.2.2 # via stack-data pycparser==2.21 - # via cffi + # via + # -c requirements/base.txt + # cffi pygments==2.15.1 # via + # -c requirements/base.txt # ipython # jupyter-console # nbconvert @@ -239,12 +255,14 @@ pyrsistent==0.19.3 # via jsonschema python-dateutil==2.8.2 # via + # -c requirements/base.txt # arrow # jupyter-client python-json-logger==2.0.7 # via jupyter-events pyyaml==6.0 # via + # -c requirements/test.txt # jupyter-events # pre-commit pyzmq==25.0.2 @@ -275,12 +293,16 @@ send2trash==1.8.2 # notebook six==1.16.0 # via + # -c requirements/base.txt + # -c requirements/test.txt # asttokens # bleach # python-dateutil # rfc3339-validator sniffio==1.3.0 - # via anyio + # via + # -c requirements/base.txt + # anyio soupsieve==2.4.1 # via beautifulsoup4 stack-data==0.6.2 @@ -295,9 +317,10 @@ tinycss2==1.2.1 # via nbconvert tomli==2.0.1 # via + 
# -c requirements/test.txt # build # pyproject-hooks -tornado==6.3.1 +tornado==6.3.2 # via # ipykernel # jupyter-client @@ -323,8 +346,11 @@ traitlets==5.9.0 # nbformat # notebook # qtconsole -typing-extensions==4.5.0 - # via ipython +typing-extensions==4.6.0 + # via + # -c requirements/base.txt + # -c requirements/test.txt + # ipython uri-template==1.2.0 # via jsonschema virtualenv==20.23.0 @@ -337,16 +363,17 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.5.1 +websocket-client==1.5.2 # via jupyter-server wheel==0.40.0 # via - # -r requirements/dev.in + # -c requirements/constraints.in # pip-tools widgetsnbextension==4.0.7 # via ipywidgets zipp==3.15.0 # via + # -c requirements/base.txt # importlib-metadata # importlib-resources diff --git a/requirements/huggingface.in b/requirements/huggingface.in new file mode 100644 index 0000000000..8cd6dc9e58 --- /dev/null +++ b/requirements/huggingface.in @@ -0,0 +1,7 @@ +-c constraints.in +-c base.txt +langdetect +sacremoses +sentencepiece +torch +transformers diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 0b4e81e6e5..4d40cd4cb2 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -2,189 +2,101 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=huggingface --output-file=requirements/huggingface.txt +# pip-compile requirements/huggingface.in # -anyio==3.6.2 - # via httpcore -argilla==1.6.0 - # via unstructured (setup.py) -backoff==2.2.1 - # via argilla -certifi==2022.12.7 +certifi==2023.5.7 # via - # httpcore - # httpx + # -c requirements/base.txt + # -c requirements/constraints.in # requests - # unstructured (setup.py) -cffi==1.15.1 - # via cryptography charset-normalizer==3.1.0 # via - # pdfminer-six + # -c requirements/base.txt # requests click==8.1.3 # via - # nltk + # -c requirements/base.txt # sacremoses -commonmark==0.9.1 - # via rich -cryptography==40.0.2 - # via 
pdfminer-six -deprecated==1.2.13 - # via argilla -et-xmlfile==1.1.0 - # via openpyxl filelock==3.12.0 # via # huggingface-hub # torch # transformers -fsspec==2023.4.0 +fsspec==2023.5.0 # via huggingface-hub -h11==0.14.0 - # via httpcore -httpcore==0.16.3 - # via httpx -httpx==0.23.3 - # via argilla huggingface-hub==0.14.1 # via transformers idna==3.4 # via - # anyio + # -c requirements/base.txt # requests - # rfc3986 -importlib-metadata==6.6.0 - # via markdown jinja2==3.1.2 # via torch joblib==1.2.0 # via - # nltk + # -c requirements/base.txt # sacremoses langdetect==1.0.9 - # via unstructured (setup.py) -lxml==4.9.2 - # via - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via unstructured (setup.py) + # via -r requirements/huggingface.in markupsafe==2.1.2 # via jinja2 -monotonic==1.6 - # via argilla mpmath==1.3.0 # via sympy -msg-parser==1.2.0 - # via unstructured (setup.py) networkx==3.1 # via torch -nltk==3.8.1 - # via unstructured (setup.py) numpy==1.23.5 # via - # argilla - # pandas + # -c requirements/base.txt # transformers -olefile==0.46 - # via msg-parser -openpyxl==3.1.2 - # via unstructured (setup.py) packaging==23.1 # via - # argilla + # -c requirements/base.txt # huggingface-hub # transformers -pandas==1.5.3 - # via - # argilla - # unstructured (setup.py) -pdfminer-six==20221105 - # via unstructured (setup.py) -pillow==9.5.0 - # via - # python-pptx - # unstructured (setup.py) -pycparser==2.21 - # via cffi -pydantic==1.10.7 - # via argilla -pygments==2.15.1 - # via rich -pypandoc==1.11 - # via unstructured (setup.py) -python-dateutil==2.8.2 - # via pandas -python-docx==0.8.11 - # via unstructured (setup.py) -python-magic==0.4.27 - # via unstructured (setup.py) -python-pptx==0.6.21 - # via unstructured (setup.py) -pytz==2023.3 - # via pandas pyyaml==6.0 # via # huggingface-hub # transformers regex==2023.5.5 # via - # nltk + # -c requirements/base.txt # sacremoses # transformers -requests==2.30.0 +requests==2.31.0 # via + # 
-c requirements/base.txt # huggingface-hub # transformers - # unstructured (setup.py) -rfc3986[idna2008]==1.5.0 - # via httpx -rich==13.0.1 - # via argilla sacremoses==0.0.53 - # via unstructured (setup.py) + # via -r requirements/huggingface.in sentencepiece==0.1.99 - # via unstructured (setup.py) + # via -r requirements/huggingface.in six==1.16.0 # via + # -c requirements/base.txt # langdetect - # python-dateutil # sacremoses -sniffio==1.3.0 - # via - # anyio - # httpcore - # httpx -sympy==1.11.1 +sympy==1.12 # via torch tokenizers==0.13.3 # via transformers -torch==2.0.0 - # via unstructured (setup.py) +torch==2.0.1 + # via -r requirements/huggingface.in tqdm==4.65.0 # via - # argilla + # -c requirements/base.txt # huggingface-hub - # nltk # sacremoses # transformers -transformers==4.28.1 - # via unstructured (setup.py) -typing-extensions==4.5.0 +transformers==4.29.2 + # via -r requirements/huggingface.in +typing-extensions==4.6.0 # via + # -c requirements/base.txt # huggingface-hub - # pydantic - # rich # torch -urllib3==2.0.2 - # via requests -wrapt==1.14.1 +urllib3==1.26.16 # via - # argilla - # deprecated -xlsxwriter==3.1.0 - # via python-pptx -zipp==3.15.0 - # via importlib-metadata + # -c requirements/base.txt + # -c requirements/constraints.in + # requests diff --git a/requirements/ingest-azure.in b/requirements/ingest-azure.in new file mode 100644 index 0000000000..d42acf96a4 --- /dev/null +++ b/requirements/ingest-azure.in @@ -0,0 +1,4 @@ +-c constraints.in +-c base.txt +adlfs +fsspec diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index 373a4a13f2..34e9d34003 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -2,24 +2,14 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=azure --output-file=requirements/ingest-azure.txt requirements/base.txt setup.py +# pip-compile requirements/ingest-azure.in # -adal==1.2.7 - # via 
azure-datalake-store -adlfs==2023.1.0 - # via unstructured (setup.py) +adlfs==2023.4.0 + # via -r requirements/ingest-azure.in aiohttp==3.8.4 # via adlfs aiosignal==1.3.1 # via aiohttp -anyio==3.6.2 - # via - # -r requirements/base.txt - # httpcore -argilla==1.6.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) async-timeout==4.0.2 # via aiohttp attrs==23.1.0 @@ -29,253 +19,89 @@ azure-core==1.26.4 # adlfs # azure-identity # azure-storage-blob -azure-datalake-store==0.0.52 +azure-datalake-store==0.0.53 # via adlfs -azure-identity==1.12.0 +azure-identity==1.13.0 # via adlfs azure-storage-blob==12.16.0 # via adlfs -backoff==2.2.1 - # via - # -r requirements/base.txt - # argilla -certifi==2022.12.7 +certifi==2023.5.7 # via - # -r requirements/base.txt - # httpcore - # httpx + # -c requirements/base.txt + # -c requirements/constraints.in # requests - # unstructured (setup.py) cffi==1.15.1 # via + # -c requirements/base.txt # azure-datalake-store # cryptography charset-normalizer==3.1.0 # via - # -r requirements/base.txt + # -c requirements/base.txt # aiohttp # requests -click==8.1.3 - # via - # -r requirements/base.txt - # nltk -commonmark==0.9.1 - # via - # -r requirements/base.txt - # rich cryptography==40.0.2 # via - # adal + # -c requirements/base.txt # azure-identity # azure-storage-blob # msal # pyjwt -deprecated==1.2.13 - # via - # -r requirements/base.txt - # argilla -et-xmlfile==1.1.0 - # via - # -r requirements/base.txt - # openpyxl frozenlist==1.3.3 # via # aiohttp # aiosignal -fsspec==2023.4.0 +fsspec==2023.5.0 # via + # -r requirements/ingest-azure.in # adlfs - # unstructured (setup.py) -h11==0.14.0 - # via - # -r requirements/base.txt - # httpcore -httpcore==0.16.3 - # via - # -r requirements/base.txt - # httpx -httpx==0.23.3 - # via - # -r requirements/base.txt - # argilla idna==3.4 # via - # -r requirements/base.txt - # anyio + # -c requirements/base.txt # requests - # rfc3986 # yarl -importlib-metadata==6.6.0 - # via - # -r 
requirements/base.txt - # markdown isodate==0.6.1 # via azure-storage-blob -joblib==1.2.0 - # via - # -r requirements/base.txt - # nltk -lxml==4.9.2 - # via - # -r requirements/base.txt - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via - # -r requirements/base.txt - # unstructured (setup.py) -monotonic==1.6 - # via - # -r requirements/base.txt - # argilla msal==1.22.0 # via + # azure-datalake-store # azure-identity # msal-extensions msal-extensions==1.0.0 # via azure-identity -msg-parser==1.2.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) multidict==6.0.4 # via # aiohttp # yarl -nltk==3.8.1 - # via - # -r requirements/base.txt - # unstructured (setup.py) -numpy==1.23.5 - # via - # -r requirements/base.txt - # argilla - # pandas -olefile==0.46 - # via - # -r requirements/base.txt - # msg-parser -openpyxl==3.1.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -packaging==23.1 - # via - # -r requirements/base.txt - # argilla -pandas==1.5.3 - # via - # -r requirements/base.txt - # argilla - # unstructured (setup.py) -pillow==9.5.0 - # via - # -r requirements/base.txt - # python-pptx - # unstructured (setup.py) portalocker==2.7.0 # via msal-extensions pycparser==2.21 - # via cffi -pydantic==1.10.7 - # via - # -r requirements/base.txt - # argilla -pygments==2.15.1 - # via - # -r requirements/base.txt - # rich -pyjwt[crypto]==2.6.0 - # via - # adal - # msal -pypandoc==1.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-dateutil==2.8.2 - # via - # -r requirements/base.txt - # adal - # pandas -python-docx==0.8.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-magic==0.4.27 # via - # -r requirements/base.txt - # unstructured (setup.py) -python-pptx==0.6.21 + # -c requirements/base.txt + # cffi +pyjwt[crypto]==2.7.0 + # via msal +requests==2.31.0 # via - # -r requirements/base.txt - # unstructured (setup.py) -pytz==2023.3 - # via - # -r 
requirements/base.txt - # pandas -regex==2023.3.23 - # via - # -r requirements/base.txt - # nltk -requests==2.28.2 - # via - # -r requirements/base.txt - # adal + # -c requirements/base.txt # azure-core # azure-datalake-store # msal - # unstructured (setup.py) -rfc3986[idna2008]==1.5.0 - # via - # -r requirements/base.txt - # httpx -rich==13.0.1 - # via - # -r requirements/base.txt - # argilla six==1.16.0 # via - # -r requirements/base.txt + # -c requirements/base.txt # azure-core # azure-identity # isodate - # python-dateutil -sniffio==1.3.0 +typing-extensions==4.6.0 # via - # -r requirements/base.txt - # anyio - # httpcore - # httpx -tqdm==4.65.0 - # via - # -r requirements/base.txt - # argilla - # nltk -typing-extensions==4.5.0 - # via - # -r requirements/base.txt + # -c requirements/base.txt # azure-core # azure-storage-blob - # pydantic - # rich -urllib3==1.26.15 +urllib3==1.26.16 # via - # -r requirements/base.txt + # -c requirements/base.txt + # -c requirements/constraints.in # requests -wrapt==1.14.1 - # via - # -r requirements/base.txt - # argilla - # deprecated -xlsxwriter==3.1.0 - # via - # -r requirements/base.txt - # python-pptx yarl==1.9.2 # via aiohttp -zipp==3.15.0 - # via - # -r requirements/base.txt - # importlib-metadata diff --git a/requirements/ingest-discord.in b/requirements/ingest-discord.in new file mode 100644 index 0000000000..b001dc30bf --- /dev/null +++ b/requirements/ingest-discord.in @@ -0,0 +1,3 @@ +-c constraints.in +-c base.txt +discord-py diff --git a/requirements/ingest-discord.txt b/requirements/ingest-discord.txt index cc63d24cdb..d353d6df38 100644 --- a/requirements/ingest-discord.txt +++ b/requirements/ingest-discord.txt @@ -2,227 +2,33 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=discord --output-file=requirements/ingest-discord.txt requirements/base.txt setup.py +# pip-compile requirements/ingest-discord.in # - aiohttp==3.8.4 # via discord-py 
aiosignal==1.3.1 # via aiohttp -anyio==3.6.2 - # via - # -r requirements/base.txt - # httpcore -argilla==1.6.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) async-timeout==4.0.2 # via aiohttp attrs==23.1.0 # via aiohttp -backoff==2.2.1 - # via - # -r requirements/base.txt - # argilla -certifi==2022.12.7 - # via - # -r requirements/base.txt - # httpcore - # httpx - # requests - # unstructured (setup.py) charset-normalizer==3.1.0 # via - # -r requirements/base.txt + # -c requirements/base.txt # aiohttp - # requests -click==8.1.3 - # via - # -r requirements/base.txt - # nltk -commonmark==0.9.1 - # via - # -r requirements/base.txt - # rich -deprecated==1.2.13 - # via - # -r requirements/base.txt - # argilla -discord-py==2.2.2 - # via unstructured (setup.py) -et-xmlfile==1.1.0 - # via - # -r requirements/base.txt - # openpyxl +discord-py==2.2.3 + # via -r requirements/ingest-discord.in frozenlist==1.3.3 # via # aiohttp # aiosignal -h11==0.14.0 - # via - # -r requirements/base.txt - # httpcore -httpcore==0.16.3 - # via - # -r requirements/base.txt - # httpx -httpx==0.23.3 - # via - # -r requirements/base.txt - # argilla idna==3.4 # via - # -r requirements/base.txt - # anyio - # requests - # rfc3986 + # -c requirements/base.txt # yarl -importlib-metadata==6.5.0 - # via - # -r requirements/base.txt - # markdown -joblib==1.2.0 - # via - # -r requirements/base.txt - # nltk -lxml==4.9.2 - # via - # -r requirements/base.txt - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via - # -r requirements/base.txt - # unstructured (setup.py) -monotonic==1.6 - # via - # -r requirements/base.txt - # argilla -msg-parser==1.2.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) multidict==6.0.4 # via # aiohttp # yarl -nltk==3.8.1 - # via - # -r requirements/base.txt - # unstructured (setup.py) -numpy==1.23.5 - # via - # -r requirements/base.txt - # argilla - # pandas -olefile==0.46 - # via - # -r requirements/base.txt - # 
msg-parser -openpyxl==3.1.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -packaging==23.1 - # via - # -r requirements/base.txt - # argilla -pandas==1.5.3 - # via - # -r requirements/base.txt - # argilla - # unstructured (setup.py) -pillow==9.5.0 - # via - # -r requirements/base.txt - # python-pptx - # unstructured (setup.py) -pydantic==1.10.7 - # via - # -r requirements/base.txt - # argilla -pygments==2.15.1 - # via - # -r requirements/base.txt - # rich -pypandoc==1.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-dateutil==2.8.2 - # via - # -r requirements/base.txt - # pandas -python-docx==0.8.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-magic==0.4.27 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-pptx==0.6.21 - # via - # -r requirements/base.txt - # unstructured (setup.py) -pytz==2023.3 - # via - # -r requirements/base.txt - # pandas -regex==2023.3.23 - # via - # -r requirements/base.txt - # nltk -requests==2.28.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -rfc3986[idna2008]==1.5.0 - # via - # -r requirements/base.txt - # httpx -rich==13.0.1 - # via - # -r requirements/base.txt - # argilla -six==1.16.0 - # via - # -r requirements/base.txt - # python-dateutil -sniffio==1.3.0 - # via - # -r requirements/base.txt - # anyio - # httpcore - # httpx -tqdm==4.65.0 - # via - # -r requirements/base.txt - # argilla - # nltk -typing-extensions==4.5.0 - # via - # -r requirements/base.txt - # pydantic - # rich -urllib3==1.26.15 - # via - # -r requirements/base.txt - # requests -wrapt==1.14.1 - # via - # -r requirements/base.txt - # argilla - # deprecated -xlsxwriter==3.1.0 - # via - # -r requirements/base.txt - # python-pptx -yarl==1.9.1 +yarl==1.9.2 # via aiohttp -zipp==3.15.0 - # via - # -r requirements/base.txt - # importlib-metadata diff --git a/requirements/ingest-github.in b/requirements/ingest-github.in new file mode 100644 index 
0000000000..60cfe5f56a --- /dev/null +++ b/requirements/ingest-github.in @@ -0,0 +1,5 @@ +-c constraints.in +-c base.txt +# NOTE - pygithub==1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436 +# In the future, we can update this to pygithub>1.58.0 +pygithub==1.57.0 diff --git a/requirements/ingest-github.txt b/requirements/ingest-github.txt index b155a5bef8..906fa614c1 100644 --- a/requirements/ingest-github.txt +++ b/requirements/ingest-github.txt @@ -2,216 +2,49 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py +# pip-compile requirements/ingest-github.in # -anyio==3.6.2 +certifi==2023.5.7 # via - # -r requirements/base.txt - # httpcore -argilla==1.6.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -backoff==2.2.1 - # via - # -r requirements/base.txt - # argilla -certifi==2022.12.7 - # via - # -r requirements/base.txt - # httpcore - # httpx + # -c requirements/base.txt + # -c requirements/constraints.in # requests - # unstructured (setup.py) cffi==1.15.1 - # via pynacl + # via + # -c requirements/base.txt + # pynacl charset-normalizer==3.1.0 # via - # -r requirements/base.txt + # -c requirements/base.txt # requests -click==8.1.3 - # via - # -r requirements/base.txt - # nltk -commonmark==0.9.1 - # via - # -r requirements/base.txt - # rich deprecated==1.2.13 # via - # -r requirements/base.txt - # argilla + # -c requirements/base.txt # pygithub -et-xmlfile==1.1.0 - # via - # -r requirements/base.txt - # openpyxl -h11==0.14.0 - # via - # -r requirements/base.txt - # httpcore -httpcore==0.16.3 - # via - # -r requirements/base.txt - # httpx -httpx==0.23.3 - # via - # -r requirements/base.txt - # argilla idna==3.4 # via - # -r requirements/base.txt - # anyio + # -c requirements/base.txt # requests - # rfc3986 -importlib-metadata==6.6.0 - # via - # -r requirements/base.txt - # markdown 
-joblib==1.2.0 - # via - # -r requirements/base.txt - # nltk -lxml==4.9.2 - # via - # -r requirements/base.txt - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via - # -r requirements/base.txt - # unstructured (setup.py) -monotonic==1.6 - # via - # -r requirements/base.txt - # argilla -msg-parser==1.2.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -nltk==3.8.1 - # via - # -r requirements/base.txt - # unstructured (setup.py) -numpy==1.23.5 - # via - # -r requirements/base.txt - # argilla - # pandas -olefile==0.46 - # via - # -r requirements/base.txt - # msg-parser -openpyxl==3.1.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -packaging==23.1 - # via - # -r requirements/base.txt - # argilla -pandas==1.5.3 - # via - # -r requirements/base.txt - # argilla - # unstructured (setup.py) -pillow==9.5.0 - # via - # -r requirements/base.txt - # python-pptx - # unstructured (setup.py) pycparser==2.21 - # via cffi -pydantic==1.10.7 # via - # -r requirements/base.txt - # argilla + # -c requirements/base.txt + # cffi pygithub==1.57.0 - # via unstructured (setup.py) -pygments==2.15.1 - # via - # -r requirements/base.txt - # rich -pyjwt==2.6.0 + # via -r requirements/ingest-github.in +pyjwt==2.7.0 # via pygithub pynacl==1.5.0 # via pygithub -pypandoc==1.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-dateutil==2.8.2 - # via - # -r requirements/base.txt - # pandas -python-docx==0.8.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-magic==0.4.27 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-pptx==0.6.21 - # via - # -r requirements/base.txt - # unstructured (setup.py) -pytz==2023.3 - # via - # -r requirements/base.txt - # pandas -regex==2023.3.23 +requests==2.31.0 # via - # -r requirements/base.txt - # nltk -requests==2.28.2 - # via - # -r requirements/base.txt + # -c requirements/base.txt # pygithub - # unstructured (setup.py) 
-rfc3986[idna2008]==1.5.0 - # via - # -r requirements/base.txt - # httpx -rich==13.0.1 - # via - # -r requirements/base.txt - # argilla -six==1.16.0 - # via - # -r requirements/base.txt - # python-dateutil -sniffio==1.3.0 - # via - # -r requirements/base.txt - # anyio - # httpcore - # httpx -tqdm==4.65.0 +urllib3==1.26.16 # via - # -r requirements/base.txt - # argilla - # nltk -typing-extensions==4.5.0 - # via - # -r requirements/base.txt - # pydantic - # rich -urllib3==1.26.15 - # via - # -r requirements/base.txt + # -c requirements/base.txt + # -c requirements/constraints.in # requests wrapt==1.14.1 # via - # -r requirements/base.txt - # argilla + # -c requirements/base.txt # deprecated -xlsxwriter==3.1.0 - # via - # -r requirements/base.txt - # python-pptx -zipp==3.15.0 - # via - # -r requirements/base.txt - # importlib-metadata diff --git a/requirements/ingest-gitlab.in b/requirements/ingest-gitlab.in new file mode 100644 index 0000000000..e4b90b53fa --- /dev/null +++ b/requirements/ingest-gitlab.in @@ -0,0 +1,3 @@ +-c constraints.in +-c base.txt +python-gitlab diff --git a/requirements/ingest-gitlab.txt b/requirements/ingest-gitlab.txt index 6496693124..fb0d82900f 100644 --- a/requirements/ingest-gitlab.txt +++ b/requirements/ingest-gitlab.txt @@ -2,210 +2,32 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=gitlab --output-file=requirements/ingest-gitlab.txt requirements/base.txt setup.py +# pip-compile requirements/ingest-gitlab.in # -anyio==3.6.2 +certifi==2023.5.7 # via - # -r requirements/base.txt - # httpcore -argilla==1.6.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -backoff==2.2.1 - # via - # -r requirements/base.txt - # argilla -certifi==2022.12.7 - # via - # -r requirements/base.txt - # httpcore - # httpx + # -c requirements/base.txt + # -c requirements/constraints.in # requests - # unstructured (setup.py) charset-normalizer==3.1.0 # via - # -r 
requirements/base.txt + # -c requirements/base.txt # requests -click==8.1.3 - # via - # -r requirements/base.txt - # nltk -commonmark==0.9.1 - # via - # -r requirements/base.txt - # rich -deprecated==1.2.13 - # via - # -r requirements/base.txt - # argilla -et-xmlfile==1.1.0 - # via - # -r requirements/base.txt - # openpyxl -h11==0.14.0 - # via - # -r requirements/base.txt - # httpcore -httpcore==0.16.3 - # via - # -r requirements/base.txt - # httpx -httpx==0.23.3 - # via - # -r requirements/base.txt - # argilla idna==3.4 # via - # -r requirements/base.txt - # anyio + # -c requirements/base.txt # requests - # rfc3986 -importlib-metadata==6.6.0 - # via - # -r requirements/base.txt - # markdown -joblib==1.2.0 - # via - # -r requirements/base.txt - # nltk -lxml==4.9.2 - # via - # -r requirements/base.txt - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via - # -r requirements/base.txt - # unstructured (setup.py) -monotonic==1.6 - # via - # -r requirements/base.txt - # argilla -msg-parser==1.2.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -nltk==3.8.1 - # via - # -r requirements/base.txt - # unstructured (setup.py) -numpy==1.23.5 - # via - # -r requirements/base.txt - # argilla - # pandas -olefile==0.46 - # via - # -r requirements/base.txt - # msg-parser -openpyxl==3.1.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -packaging==23.1 - # via - # -r requirements/base.txt - # argilla -pandas==1.5.3 - # via - # -r requirements/base.txt - # argilla - # unstructured (setup.py) -pillow==9.5.0 - # via - # -r requirements/base.txt - # python-pptx - # unstructured (setup.py) -pydantic==1.10.7 - # via - # -r requirements/base.txt - # argilla -pygments==2.15.1 - # via - # -r requirements/base.txt - # rich -pypandoc==1.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-dateutil==2.8.2 - # via - # -r requirements/base.txt - # pandas -python-docx==0.8.11 - # via - # -r 
requirements/base.txt - # unstructured (setup.py) python-gitlab==3.14.0 - # via unstructured (setup.py) -python-magic==0.4.27 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-pptx==0.6.21 - # via - # -r requirements/base.txt - # unstructured (setup.py) -pytz==2023.3 - # via - # -r requirements/base.txt - # pandas -regex==2023.3.23 + # via -r requirements/ingest-gitlab.in +requests==2.31.0 # via - # -r requirements/base.txt - # nltk -requests==2.28.2 - # via - # -r requirements/base.txt + # -c requirements/base.txt # python-gitlab # requests-toolbelt - # unstructured (setup.py) -requests-toolbelt==0.10.1 +requests-toolbelt==1.0.0 # via python-gitlab -rfc3986[idna2008]==1.5.0 - # via - # -r requirements/base.txt - # httpx -rich==13.0.1 - # via - # -r requirements/base.txt - # argilla -six==1.16.0 - # via - # -r requirements/base.txt - # python-dateutil -sniffio==1.3.0 - # via - # -r requirements/base.txt - # anyio - # httpcore - # httpx -tqdm==4.65.0 +urllib3==1.26.16 # via - # -r requirements/base.txt - # argilla - # nltk -typing-extensions==4.5.0 - # via - # -r requirements/base.txt - # pydantic - # rich -urllib3==1.26.15 - # via - # -r requirements/base.txt + # -c requirements/base.txt + # -c requirements/constraints.in # requests -wrapt==1.14.1 - # via - # -r requirements/base.txt - # argilla - # deprecated -xlsxwriter==3.1.0 - # via - # -r requirements/base.txt - # python-pptx -zipp==3.15.0 - # via - # -r requirements/base.txt - # importlib-metadata diff --git a/requirements/ingest-google-drive.in b/requirements/ingest-google-drive.in new file mode 100644 index 0000000000..1fa93a782a --- /dev/null +++ b/requirements/ingest-google-drive.in @@ -0,0 +1,3 @@ +-c constraints.in +-c base.txt +google-api-python-client diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index 3c73f48638..a3501e8281 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -2,54 +2,24 @@ # 
This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=google-drive --output-file=requirements/ingest-google-drive.txt requirements/base.txt setup.py +# pip-compile requirements/ingest-google-drive.in # -anyio==3.6.2 - # via - # -r requirements/base.txt - # httpcore -argilla==1.6.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -backoff==2.2.1 - # via - # -r requirements/base.txt - # argilla cachetools==5.3.0 # via google-auth -certifi==2022.12.7 +certifi==2023.5.7 # via - # -r requirements/base.txt - # httpcore - # httpx + # -c requirements/base.txt + # -c requirements/constraints.in # requests - # unstructured (setup.py) charset-normalizer==3.1.0 # via - # -r requirements/base.txt + # -c requirements/base.txt # requests -click==8.1.3 - # via - # -r requirements/base.txt - # nltk -commonmark==0.9.1 - # via - # -r requirements/base.txt - # rich -deprecated==1.2.13 - # via - # -r requirements/base.txt - # argilla -et-xmlfile==1.1.0 - # via - # -r requirements/base.txt - # openpyxl google-api-core==2.11.0 # via google-api-python-client google-api-python-client==2.86.0 - # via unstructured (setup.py) -google-auth==2.17.3 + # via -r requirements/ingest-google-drive.in +google-auth==2.18.1 # via # google-api-core # google-api-python-client @@ -58,187 +28,43 @@ google-auth-httplib2==0.1.0 # via google-api-python-client googleapis-common-protos==1.59.0 # via google-api-core -h11==0.14.0 - # via - # -r requirements/base.txt - # httpcore -httpcore==0.16.3 - # via - # -r requirements/base.txt - # httpx httplib2==0.22.0 # via # google-api-python-client # google-auth-httplib2 -httpx==0.23.3 - # via - # -r requirements/base.txt - # argilla idna==3.4 # via - # -r requirements/base.txt - # anyio + # -c requirements/base.txt # requests - # rfc3986 -importlib-metadata==6.6.0 - # via - # -r requirements/base.txt - # markdown -joblib==1.2.0 - # via - # -r requirements/base.txt - # nltk -lxml==4.9.2 - # via - # 
-r requirements/base.txt - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via - # -r requirements/base.txt - # unstructured (setup.py) -monotonic==1.6 - # via - # -r requirements/base.txt - # argilla -msg-parser==1.2.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -nltk==3.8.1 - # via - # -r requirements/base.txt - # unstructured (setup.py) -numpy==1.23.5 - # via - # -r requirements/base.txt - # argilla - # pandas -olefile==0.46 - # via - # -r requirements/base.txt - # msg-parser -openpyxl==3.1.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -packaging==23.1 - # via - # -r requirements/base.txt - # argilla -pandas==1.5.3 - # via - # -r requirements/base.txt - # argilla - # unstructured (setup.py) -pillow==9.5.0 - # via - # -r requirements/base.txt - # python-pptx - # unstructured (setup.py) protobuf==3.20.3 # via + # -c requirements/constraints.in # google-api-core # googleapis-common-protos - # unstructured (setup.py) pyasn1==0.5.0 # via # pyasn1-modules # rsa pyasn1-modules==0.3.0 # via google-auth -pydantic==1.10.7 - # via - # -r requirements/base.txt - # argilla -pygments==2.15.1 - # via - # -r requirements/base.txt - # rich -pypandoc==1.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) pyparsing==3.0.9 # via httplib2 -python-dateutil==2.8.2 - # via - # -r requirements/base.txt - # pandas -python-docx==0.8.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-magic==0.4.27 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-pptx==0.6.21 - # via - # -r requirements/base.txt - # unstructured (setup.py) -pytz==2023.3 - # via - # -r requirements/base.txt - # pandas -regex==2023.3.23 +requests==2.31.0 # via - # -r requirements/base.txt - # nltk -requests==2.28.2 - # via - # -r requirements/base.txt + # -c requirements/base.txt # google-api-core - # unstructured (setup.py) -rfc3986[idna2008]==1.5.0 - # via - # -r requirements/base.txt - # 
httpx -rich==13.0.1 - # via - # -r requirements/base.txt - # argilla rsa==4.9 # via google-auth six==1.16.0 # via - # -r requirements/base.txt + # -c requirements/base.txt # google-auth # google-auth-httplib2 - # python-dateutil -sniffio==1.3.0 - # via - # -r requirements/base.txt - # anyio - # httpcore - # httpx -tqdm==4.65.0 - # via - # -r requirements/base.txt - # argilla - # nltk -typing-extensions==4.5.0 - # via - # -r requirements/base.txt - # pydantic - # rich uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.15 +urllib3==1.26.16 # via - # -r requirements/base.txt + # -c requirements/base.txt + # -c requirements/constraints.in + # google-auth # requests -wrapt==1.14.1 - # via - # -r requirements/base.txt - # argilla - # deprecated -xlsxwriter==3.1.0 - # via - # -r requirements/base.txt - # python-pptx -zipp==3.15.0 - # via - # -r requirements/base.txt - # importlib-metadata diff --git a/requirements/ingest-reddit.in b/requirements/ingest-reddit.in new file mode 100644 index 0000000000..fbd91ad43d --- /dev/null +++ b/requirements/ingest-reddit.in @@ -0,0 +1,3 @@ +-c constraints.in +-c base.txt +praw diff --git a/requirements/ingest-reddit.txt b/requirements/ingest-reddit.txt index f4859ed8e7..7bbd35b5eb 100644 --- a/requirements/ingest-reddit.txt +++ b/requirements/ingest-reddit.txt @@ -2,214 +2,36 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py +# pip-compile requirements/ingest-reddit.in # -anyio==3.6.2 +certifi==2023.5.7 # via - # -r requirements/base.txt - # httpcore -argilla==1.6.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -backoff==2.2.1 - # via - # -r requirements/base.txt - # argilla -certifi==2022.12.7 - # via - # -r requirements/base.txt - # httpcore - # httpx + # -c requirements/base.txt + # -c requirements/constraints.in # requests - # unstructured (setup.py) 
charset-normalizer==3.1.0 # via - # -r requirements/base.txt + # -c requirements/base.txt # requests -click==8.1.3 - # via - # -r requirements/base.txt - # nltk -commonmark==0.9.1 - # via - # -r requirements/base.txt - # rich -deprecated==1.2.13 - # via - # -r requirements/base.txt - # argilla -et-xmlfile==1.1.0 - # via - # -r requirements/base.txt - # openpyxl -h11==0.14.0 - # via - # -r requirements/base.txt - # httpcore -httpcore==0.16.3 - # via - # -r requirements/base.txt - # httpx -httpx==0.23.3 - # via - # -r requirements/base.txt - # argilla idna==3.4 # via - # -r requirements/base.txt - # anyio + # -c requirements/base.txt # requests - # rfc3986 -importlib-metadata==6.6.0 - # via - # -r requirements/base.txt - # markdown -joblib==1.2.0 - # via - # -r requirements/base.txt - # nltk -lxml==4.9.2 - # via - # -r requirements/base.txt - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via - # -r requirements/base.txt - # unstructured (setup.py) -monotonic==1.6 - # via - # -r requirements/base.txt - # argilla -msg-parser==1.2.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -nltk==3.8.1 - # via - # -r requirements/base.txt - # unstructured (setup.py) -numpy==1.23.5 - # via - # -r requirements/base.txt - # argilla - # pandas -olefile==0.46 - # via - # -r requirements/base.txt - # msg-parser -openpyxl==3.1.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -packaging==23.1 - # via - # -r requirements/base.txt - # argilla -pandas==1.5.3 - # via - # -r requirements/base.txt - # argilla - # unstructured (setup.py) -pillow==9.5.0 - # via - # -r requirements/base.txt - # python-pptx - # unstructured (setup.py) praw==7.7.0 - # via unstructured (setup.py) + # via -r requirements/ingest-reddit.in prawcore==2.3.0 # via praw -pydantic==1.10.7 - # via - # -r requirements/base.txt - # argilla -pygments==2.15.1 - # via - # -r requirements/base.txt - # rich -pypandoc==1.11 - # via - # -r requirements/base.txt - # 
unstructured (setup.py) -python-dateutil==2.8.2 - # via - # -r requirements/base.txt - # pandas -python-docx==0.8.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-magic==0.4.27 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-pptx==0.6.21 - # via - # -r requirements/base.txt - # unstructured (setup.py) -pytz==2023.3 - # via - # -r requirements/base.txt - # pandas -regex==2023.3.23 +requests==2.31.0 # via - # -r requirements/base.txt - # nltk -requests==2.28.2 - # via - # -r requirements/base.txt + # -c requirements/base.txt # prawcore - # unstructured (setup.py) # update-checker -rfc3986[idna2008]==1.5.0 - # via - # -r requirements/base.txt - # httpx -rich==13.0.1 - # via - # -r requirements/base.txt - # argilla -six==1.16.0 - # via - # -r requirements/base.txt - # python-dateutil -sniffio==1.3.0 - # via - # -r requirements/base.txt - # anyio - # httpcore - # httpx -tqdm==4.65.0 - # via - # -r requirements/base.txt - # argilla - # nltk -typing-extensions==4.5.0 - # via - # -r requirements/base.txt - # pydantic - # rich update-checker==0.18.0 # via praw -urllib3==1.26.15 +urllib3==1.26.16 # via - # -r requirements/base.txt + # -c requirements/base.txt + # -c requirements/constraints.in # requests -websocket-client==1.5.1 +websocket-client==1.5.2 # via praw -wrapt==1.14.1 - # via - # -r requirements/base.txt - # argilla - # deprecated -xlsxwriter==3.1.0 - # via - # -r requirements/base.txt - # python-pptx -zipp==3.15.0 - # via - # -r requirements/base.txt - # importlib-metadata diff --git a/requirements/ingest-s3.in b/requirements/ingest-s3.in new file mode 100644 index 0000000000..c848714f96 --- /dev/null +++ b/requirements/ingest-s3.in @@ -0,0 +1,4 @@ +-c constraints.in +-c base.txt +s3fs +fsspec diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index 49a1d27ab0..cdbab18009 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -2,7 +2,7 @@ # This file is autogenerated by 
pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=s3 --output-file=requirements/ingest-s3.txt requirements/base.txt setup.py +# pip-compile requirements/ingest-s3.in # aiobotocore==2.5.0 # via s3fs @@ -14,232 +14,56 @@ aioitertools==0.11.0 # via aiobotocore aiosignal==1.3.1 # via aiohttp -anyio==3.6.2 - # via - # -r requirements/base.txt - # httpcore -argilla==1.6.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) async-timeout==4.0.2 # via aiohttp attrs==23.1.0 # via aiohttp -backoff==2.2.1 - # via - # -r requirements/base.txt - # argilla botocore==1.29.76 # via aiobotocore -certifi==2022.12.7 - # via - # -r requirements/base.txt - # httpcore - # httpx - # requests - # unstructured (setup.py) charset-normalizer==3.1.0 # via - # -r requirements/base.txt + # -c requirements/base.txt # aiohttp - # requests -click==8.1.3 - # via - # -r requirements/base.txt - # nltk -commonmark==0.9.1 - # via - # -r requirements/base.txt - # rich -deprecated==1.2.13 - # via - # -r requirements/base.txt - # argilla -et-xmlfile==1.1.0 - # via - # -r requirements/base.txt - # openpyxl frozenlist==1.3.3 # via # aiohttp # aiosignal -fsspec==2023.4.0 +fsspec==2023.5.0 # via + # -r requirements/ingest-s3.in # s3fs - # unstructured (setup.py) -h11==0.14.0 - # via - # -r requirements/base.txt - # httpcore -httpcore==0.16.3 - # via - # -r requirements/base.txt - # httpx -httpx==0.23.3 - # via - # -r requirements/base.txt - # argilla idna==3.4 # via - # -r requirements/base.txt - # anyio - # requests - # rfc3986 + # -c requirements/base.txt # yarl -importlib-metadata==6.6.0 - # via - # -r requirements/base.txt - # markdown jmespath==1.0.1 # via botocore -joblib==1.2.0 - # via - # -r requirements/base.txt - # nltk -lxml==4.9.2 - # via - # -r requirements/base.txt - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via - # -r requirements/base.txt - # unstructured (setup.py) -monotonic==1.6 - # via - # -r 
requirements/base.txt - # argilla -msg-parser==1.2.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) multidict==6.0.4 # via # aiohttp # yarl -nltk==3.8.1 - # via - # -r requirements/base.txt - # unstructured (setup.py) -numpy==1.23.5 - # via - # -r requirements/base.txt - # argilla - # pandas -olefile==0.46 - # via - # -r requirements/base.txt - # msg-parser -openpyxl==3.1.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -packaging==23.1 - # via - # -r requirements/base.txt - # argilla -pandas==1.5.3 - # via - # -r requirements/base.txt - # argilla - # unstructured (setup.py) -pillow==9.5.0 - # via - # -r requirements/base.txt - # python-pptx - # unstructured (setup.py) -pydantic==1.10.7 - # via - # -r requirements/base.txt - # argilla -pygments==2.15.1 - # via - # -r requirements/base.txt - # rich -pypandoc==1.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) python-dateutil==2.8.2 # via - # -r requirements/base.txt + # -c requirements/base.txt # botocore - # pandas -python-docx==0.8.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-magic==0.4.27 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-pptx==0.6.21 - # via - # -r requirements/base.txt - # unstructured (setup.py) -pytz==2023.3 - # via - # -r requirements/base.txt - # pandas -regex==2023.3.23 - # via - # -r requirements/base.txt - # nltk -requests==2.28.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -rfc3986[idna2008]==1.5.0 - # via - # -r requirements/base.txt - # httpx -rich==13.0.1 - # via - # -r requirements/base.txt - # argilla -s3fs==2023.4.0 - # via unstructured (setup.py) +s3fs==2023.5.0 + # via -r requirements/ingest-s3.in six==1.16.0 # via - # -r requirements/base.txt + # -c requirements/base.txt # python-dateutil -sniffio==1.3.0 +typing-extensions==4.6.0 # via - # -r requirements/base.txt - # anyio - # httpcore - # httpx -tqdm==4.65.0 - # via - # -r requirements/base.txt - # 
argilla - # nltk -typing-extensions==4.5.0 - # via - # -r requirements/base.txt + # -c requirements/base.txt # aioitertools - # pydantic - # rich -urllib3==1.26.15 +urllib3==1.26.16 # via - # -r requirements/base.txt + # -c requirements/base.txt + # -c requirements/constraints.in # botocore - # requests wrapt==1.14.1 # via - # -r requirements/base.txt + # -c requirements/base.txt # aiobotocore - # argilla - # deprecated -xlsxwriter==3.1.0 - # via - # -r requirements/base.txt - # python-pptx yarl==1.9.2 # via aiohttp -zipp==3.15.0 - # via - # -r requirements/base.txt - # importlib-metadata diff --git a/requirements/ingest-slack.in b/requirements/ingest-slack.in new file mode 100644 index 0000000000..56decdf676 --- /dev/null +++ b/requirements/ingest-slack.in @@ -0,0 +1,3 @@ +-c constraints.in +-c base.txt +slack_sdk diff --git a/requirements/ingest-slack.txt b/requirements/ingest-slack.txt index 5a60c1c52a..b8c94147ba 100644 --- a/requirements/ingest-slack.txt +++ b/requirements/ingest-slack.txt @@ -2,206 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=slack --output-file=requirements/ingest-slack.txt requirements/base.txt setup.py +# pip-compile requirements/ingest-slack.in # -anyio==3.6.2 - # via - # -r requirements/base.txt - # httpcore -argilla==1.6.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -backoff==2.2.1 - # via - # -r requirements/base.txt - # argilla -certifi==2022.12.7 - # via - # -r requirements/base.txt - # httpcore - # httpx - # requests - # unstructured (setup.py) -charset-normalizer==3.1.0 - # via - # -r requirements/base.txt - # requests -click==8.1.3 - # via - # -r requirements/base.txt - # nltk -commonmark==0.9.1 - # via - # -r requirements/base.txt - # rich -deprecated==1.2.13 - # via - # -r requirements/base.txt - # argilla -et-xmlfile==1.1.0 - # via - # -r requirements/base.txt - # openpyxl -h11==0.14.0 - # via - # -r requirements/base.txt - # 
httpcore -httpcore==0.16.3 - # via - # -r requirements/base.txt - # httpx -httpx==0.23.3 - # via - # -r requirements/base.txt - # argilla -idna==3.4 - # via - # -r requirements/base.txt - # anyio - # requests - # rfc3986 -importlib-metadata==6.6.0 - # via - # -r requirements/base.txt - # markdown -joblib==1.2.0 - # via - # -r requirements/base.txt - # nltk -lxml==4.9.2 - # via - # -r requirements/base.txt - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via - # -r requirements/base.txt - # unstructured (setup.py) -monotonic==1.6 - # via - # -r requirements/base.txt - # argilla -msg-parser==1.2.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -nltk==3.8.1 - # via - # -r requirements/base.txt - # unstructured (setup.py) -numpy==1.23.5 - # via - # -r requirements/base.txt - # argilla - # pandas -olefile==0.46 - # via - # -r requirements/base.txt - # msg-parser -openpyxl==3.1.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -packaging==23.1 - # via - # -r requirements/base.txt - # argilla -pandas==1.5.3 - # via - # -r requirements/base.txt - # argilla - # unstructured (setup.py) -pillow==9.5.0 - # via - # -r requirements/base.txt - # python-pptx - # unstructured (setup.py) -pydantic==1.10.7 - # via - # -r requirements/base.txt - # argilla -pygments==2.15.1 - # via - # -r requirements/base.txt - # rich -pypandoc==1.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-dateutil==2.8.2 - # via - # -r requirements/base.txt - # pandas -python-docx==0.8.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-magic==0.4.27 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-pptx==0.6.21 - # via - # -r requirements/base.txt - # unstructured (setup.py) -pytz==2023.3 - # via - # -r requirements/base.txt - # pandas -regex==2023.3.23 - # via - # -r requirements/base.txt - # nltk -requests==2.28.2 - # via - # -r requirements/base.txt - # unstructured 
(setup.py) -rfc3986[idna2008]==1.5.0 - # via - # -r requirements/base.txt - # httpx -rich==13.0.1 - # via - # -r requirements/base.txt - # argilla -six==1.16.0 - # via - # -r requirements/base.txt - # python-dateutil -slack-sdk==3.21.2 - # via unstructured (setup.py) -sniffio==1.3.0 - # via - # -r requirements/base.txt - # anyio - # httpcore - # httpx -tqdm==4.65.0 - # via - # -r requirements/base.txt - # argilla - # nltk -typing-extensions==4.5.0 - # via - # -r requirements/base.txt - # pydantic - # rich -urllib3==1.26.15 - # via - # -r requirements/base.txt - # requests -wrapt==1.14.1 - # via - # -r requirements/base.txt - # argilla - # deprecated -xlsxwriter==3.1.0 - # via - # -r requirements/base.txt - # python-pptx -zipp==3.15.0 - # via - # -r requirements/base.txt - # importlib-metadata +slack-sdk==3.21.3 + # via -r requirements/ingest-slack.in diff --git a/requirements/ingest-wikipedia.in b/requirements/ingest-wikipedia.in new file mode 100644 index 0000000000..5bad52243c --- /dev/null +++ b/requirements/ingest-wikipedia.in @@ -0,0 +1,3 @@ +-c constraints.in +-c base.txt +wikipedia diff --git a/requirements/ingest-wikipedia.txt b/requirements/ingest-wikipedia.txt index f289c0a94d..53fcd38512 100644 --- a/requirements/ingest-wikipedia.txt +++ b/requirements/ingest-wikipedia.txt @@ -2,211 +2,33 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=wikipedia --output-file=requirements/ingest-wikipedia.txt requirements/base.txt setup.py +# pip-compile requirements/ingest-wikipedia.in # -anyio==3.6.2 - # via - # -r requirements/base.txt - # httpcore -argilla==1.6.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -backoff==2.2.1 - # via - # -r requirements/base.txt - # argilla beautifulsoup4==4.12.2 # via wikipedia -certifi==2022.12.7 +certifi==2023.5.7 # via - # -r requirements/base.txt - # httpcore - # httpx + # -c requirements/base.txt + # -c requirements/constraints.in # requests - 
# unstructured (setup.py) charset-normalizer==3.1.0 # via - # -r requirements/base.txt + # -c requirements/base.txt # requests -click==8.1.3 - # via - # -r requirements/base.txt - # nltk -commonmark==0.9.1 - # via - # -r requirements/base.txt - # rich -deprecated==1.2.13 - # via - # -r requirements/base.txt - # argilla -et-xmlfile==1.1.0 - # via - # -r requirements/base.txt - # openpyxl -h11==0.14.0 - # via - # -r requirements/base.txt - # httpcore -httpcore==0.16.3 - # via - # -r requirements/base.txt - # httpx -httpx==0.23.3 - # via - # -r requirements/base.txt - # argilla idna==3.4 # via - # -r requirements/base.txt - # anyio + # -c requirements/base.txt # requests - # rfc3986 -importlib-metadata==6.6.0 - # via - # -r requirements/base.txt - # markdown -joblib==1.2.0 - # via - # -r requirements/base.txt - # nltk -lxml==4.9.2 - # via - # -r requirements/base.txt - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via - # -r requirements/base.txt - # unstructured (setup.py) -monotonic==1.6 - # via - # -r requirements/base.txt - # argilla -msg-parser==1.2.0 - # via - # -r requirements/base.txt - # unstructured (setup.py) -nltk==3.8.1 - # via - # -r requirements/base.txt - # unstructured (setup.py) -numpy==1.23.5 - # via - # -r requirements/base.txt - # argilla - # pandas -olefile==0.46 - # via - # -r requirements/base.txt - # msg-parser -openpyxl==3.1.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) -packaging==23.1 - # via - # -r requirements/base.txt - # argilla -pandas==1.5.3 +requests==2.31.0 # via - # -r requirements/base.txt - # argilla - # unstructured (setup.py) -pillow==9.5.0 - # via - # -r requirements/base.txt - # python-pptx - # unstructured (setup.py) -pydantic==1.10.7 - # via - # -r requirements/base.txt - # argilla -pygments==2.15.1 - # via - # -r requirements/base.txt - # rich -pypandoc==1.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-dateutil==2.8.2 - # via - # -r 
requirements/base.txt - # pandas -python-docx==0.8.11 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-magic==0.4.27 - # via - # -r requirements/base.txt - # unstructured (setup.py) -python-pptx==0.6.21 - # via - # -r requirements/base.txt - # unstructured (setup.py) -pytz==2023.3 - # via - # -r requirements/base.txt - # pandas -regex==2023.3.23 - # via - # -r requirements/base.txt - # nltk -requests==2.28.2 - # via - # -r requirements/base.txt - # unstructured (setup.py) + # -c requirements/base.txt # wikipedia -rfc3986[idna2008]==1.5.0 - # via - # -r requirements/base.txt - # httpx -rich==13.0.1 - # via - # -r requirements/base.txt - # argilla -six==1.16.0 - # via - # -r requirements/base.txt - # python-dateutil -sniffio==1.3.0 - # via - # -r requirements/base.txt - # anyio - # httpcore - # httpx soupsieve==2.4.1 # via beautifulsoup4 -tqdm==4.65.0 +urllib3==1.26.16 # via - # -r requirements/base.txt - # argilla - # nltk -typing-extensions==4.5.0 - # via - # -r requirements/base.txt - # pydantic - # rich -urllib3==1.26.15 - # via - # -r requirements/base.txt + # -c requirements/base.txt + # -c requirements/constraints.in # requests wikipedia==1.4.0 - # via unstructured (setup.py) -wrapt==1.14.1 - # via - # -r requirements/base.txt - # argilla - # deprecated -xlsxwriter==3.1.0 - # via - # -r requirements/base.txt - # python-pptx -zipp==3.15.0 - # via - # -r requirements/base.txt - # importlib-metadata + # via -r requirements/ingest-wikipedia.in diff --git a/requirements/local-inference.in b/requirements/local-inference.in new file mode 100644 index 0000000000..3c7368e95a --- /dev/null +++ b/requirements/local-inference.in @@ -0,0 +1,3 @@ +-c constraints.in +-c base.txt +unstructured-inference==0.4.4 diff --git a/requirements/local-inference.txt b/requirements/local-inference.txt index a1629105dc..9885124069 100644 --- a/requirements/local-inference.txt +++ b/requirements/local-inference.txt @@ -2,71 +2,61 @@ # This file is autogenerated by 
pip-compile with Python 3.8 # by the following command: # -# pip-compile --extra=local-inference --output-file=requirements/local-inference.txt +# pip-compile requirements/local-inference.in # antlr4-python3-runtime==4.9.3 # via omegaconf anyio==3.6.2 # via - # httpcore + # -c requirements/base.txt # starlette -argilla==1.6.0 - # via unstructured (setup.py) -backoff==2.2.1 - # via argilla -certifi==2022.12.7 - # via - # httpcore - # httpx +certifi==2023.5.7 + # via + # -c requirements/base.txt + # -c requirements/constraints.in # requests - # unstructured (setup.py) cffi==1.15.1 - # via cryptography + # via + # -c requirements/base.txt + # cryptography charset-normalizer==3.1.0 # via + # -c requirements/base.txt # pdfminer-six # requests click==8.1.3 # via - # nltk + # -c requirements/base.txt # uvicorn coloredlogs==15.0.1 # via onnxruntime -commonmark==0.9.1 - # via rich contourpy==1.0.7 # via matplotlib cryptography==40.0.2 - # via pdfminer-six + # via + # -c requirements/base.txt + # pdfminer-six cycler==0.11.0 # via matplotlib -deprecated==1.2.13 - # via argilla -effdet==0.3.0 +effdet==0.4.1 # via layoutparser -et-xmlfile==1.1.0 - # via openpyxl -fastapi==0.95.1 +fastapi==0.95.2 # via unstructured-inference filelock==3.12.0 # via # huggingface-hub # torch # transformers -flatbuffers==23.3.3 +flatbuffers==23.5.9 # via onnxruntime -fonttools==4.39.3 +fonttools==4.39.4 # via matplotlib -fsspec==2023.4.0 +fsspec==2023.5.0 # via huggingface-hub h11==0.14.0 # via - # httpcore + # -c requirements/base.txt # uvicorn -httpcore==0.16.3 - # via httpx -httpx==0.23.3 - # via argilla huggingface-hub==0.14.1 # via # timm @@ -76,47 +66,30 @@ humanfriendly==10.0 # via coloredlogs idna==3.4 # via + # -c requirements/base.txt # anyio # requests - # rfc3986 -importlib-metadata==6.6.0 - # via markdown importlib-resources==5.12.0 # via matplotlib iopath==0.1.10 # via layoutparser jinja2==3.1.2 # via torch -joblib==1.2.0 - # via nltk kiwisolver==1.4.4 # via matplotlib 
layoutparser[layoutmodels,tesseract]==0.3.4 # via unstructured-inference -lxml==4.9.2 - # via - # python-docx - # python-pptx - # unstructured (setup.py) -markdown==3.4.3 - # via unstructured (setup.py) markupsafe==2.1.2 # via jinja2 matplotlib==3.7.1 # via pycocotools -monotonic==1.6 - # via argilla mpmath==1.3.0 # via sympy -msg-parser==1.2.0 - # via unstructured (setup.py) networkx==3.1 # via torch -nltk==3.8.1 - # via unstructured (setup.py) numpy==1.23.5 # via - # argilla + # -c requirements/base.txt # contourpy # layoutparser # matplotlib @@ -127,8 +100,6 @@ numpy==1.23.5 # scipy # torchvision # transformers -olefile==0.46 - # via msg-parser omegaconf==2.3.0 # via effdet onnxruntime==1.14.1 @@ -137,11 +108,9 @@ opencv-python==4.7.0.72 # via # layoutparser # unstructured-inference -openpyxl==3.1.2 - # via unstructured (setup.py) packaging==23.1 # via - # argilla + # -c requirements/base.txt # huggingface-hub # matplotlib # onnxruntime @@ -149,61 +118,56 @@ packaging==23.1 # transformers pandas==1.5.3 # via - # argilla + # -c requirements/base.txt # layoutparser - # unstructured (setup.py) pdf2image==1.16.3 # via layoutparser pdfminer-six==20221105 # via + # -c requirements/base.txt # pdfplumber - # unstructured (setup.py) pdfplumber==0.9.0 # via layoutparser pillow==9.5.0 # via + # -c requirements/base.txt # layoutparser # matplotlib # pdf2image # pdfplumber # pytesseract - # python-pptx # torchvision - # unstructured (setup.py) portalocker==2.7.0 # via iopath -protobuf==4.22.4 - # via onnxruntime +protobuf==3.20.3 + # via + # -c requirements/constraints.in + # onnxruntime pycocotools==2.0.6 # via effdet pycparser==2.21 - # via cffi -pydantic==1.10.7 # via - # argilla + # -c requirements/base.txt + # cffi +pydantic==1.10.8 + # via + # -c requirements/base.txt # fastapi -pygments==2.15.1 - # via rich -pypandoc==1.11 - # via unstructured (setup.py) pyparsing==3.0.9 # via matplotlib pytesseract==0.3.10 # via layoutparser python-dateutil==2.8.2 # via + # -c 
requirements/base.txt # matplotlib # pandas -python-docx==0.8.11 - # via unstructured (setup.py) -python-magic==0.4.27 - # via unstructured (setup.py) python-multipart==0.0.6 # via unstructured-inference -python-pptx==0.6.21 - # via unstructured (setup.py) pytz==2023.3 - # via pandas + # via + # -c requirements/base.txt + # pandas pyyaml==6.0 # via # huggingface-hub @@ -213,80 +177,75 @@ pyyaml==6.0 # transformers regex==2023.5.5 # via - # nltk + # -c requirements/base.txt # transformers -requests==2.30.0 +requests==2.31.0 # via + # -c requirements/base.txt # huggingface-hub # torchvision # transformers - # unstructured (setup.py) -rfc3986[idna2008]==1.5.0 - # via httpx -rich==13.0.1 - # via argilla +safetensors==0.3.1 + # via timm scipy==1.10.1 # via layoutparser six==1.16.0 - # via python-dateutil + # via + # -c requirements/base.txt + # python-dateutil sniffio==1.3.0 # via + # -c requirements/base.txt # anyio - # httpcore - # httpx -starlette==0.26.1 +starlette==0.27.0 # via fastapi -sympy==1.11.1 +sympy==1.12 # via # onnxruntime # torch -timm==0.6.13 +timm==0.9.2 # via effdet tokenizers==0.13.3 # via transformers -torch==2.0.0 +torch==2.0.1 # via # effdet # layoutparser # timm # torchvision -torchvision==0.15.1 +torchvision==0.15.2 # via # effdet # layoutparser # timm tqdm==4.65.0 # via - # argilla + # -c requirements/base.txt # huggingface-hub # iopath - # nltk # transformers -transformers==4.28.1 +transformers==4.29.2 # via unstructured-inference -typing-extensions==4.5.0 +typing-extensions==4.6.0 # via + # -c requirements/base.txt # huggingface-hub # iopath # pydantic - # rich # starlette # torch unstructured-inference==0.4.4 - # via unstructured (setup.py) -urllib3==2.0.2 - # via requests + # via -r requirements/local-inference.in +urllib3==1.26.16 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests uvicorn==0.22.0 # via unstructured-inference wand==0.6.11 # via pdfplumber -wrapt==1.14.1 - # via - # argilla - # deprecated 
-xlsxwriter==3.1.0 - # via python-pptx zipp==3.15.0 # via - # importlib-metadata + # -c requirements/base.txt # importlib-resources diff --git a/requirements/test.in b/requirements/test.in index 9a2aa11531..730b310f5b 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -1,3 +1,5 @@ +-c constraints.in +-c base.txt black>=22.3.0 coverage # NOTE(mrobinson) - Pinning click due to a unicode issue in black @@ -14,7 +16,3 @@ types-tabulate types-requests vcrpy ruff - -# NOTE(robinson) - The following pins are to address -# vulnerabilities in dependency scans -certifi>=2022.12.07 diff --git a/requirements/test.txt b/requirements/test.txt index fc12242e5d..65c107fd76 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -4,21 +4,22 @@ # # pip-compile requirements/test.in # ---extra-index-url https://pypi.ngc.nvidia.com ---trusted-host pypi.ngc.nvidia.com - appdirs==1.4.4 # via label-studio-tools black==23.3.0 # via -r requirements/test.in -certifi==2022.12.7 +certifi==2023.5.7 # via - # -r requirements/test.in + # -c requirements/base.txt + # -c requirements/constraints.in # requests charset-normalizer==3.1.0 - # via requests + # via + # -c requirements/base.txt + # requests click==8.1.3 # via + # -c requirements/base.txt # -r requirements/test.in # black coverage[toml]==7.2.5 @@ -31,23 +32,25 @@ flake8==6.0.0 # via -r requirements/test.in idna==3.4 # via + # -c requirements/base.txt # requests # yarl iniconfig==2.0.0 # via pytest -label-studio-sdk==0.0.23 +label-studio-sdk==0.0.27 # via -r requirements/test.in label-studio-tools==0.0.2 # via label-studio-sdk lxml==4.9.2 # via + # -c requirements/base.txt # label-studio-sdk # label-studio-tools mccabe==0.7.0 # via flake8 multidict==6.0.4 # via yarl -mypy==1.2.0 +mypy==1.3.0 # via -r requirements/test.in mypy-extensions==1.0.0 # via @@ -55,18 +58,21 @@ mypy-extensions==1.0.0 # mypy packaging==23.1 # via + # -c requirements/base.txt # black # pytest pathspec==0.11.1 # via black -platformdirs==3.5.0 
+platformdirs==3.5.1 # via black pluggy==1.0.0 # via pytest pycodestyle==2.10.0 # via flake8 -pydantic==1.10.7 - # via label-studio-sdk +pydantic==1.10.8 + # via + # -c requirements/base.txt + # label-studio-sdk pyflakes==3.0.1 # via flake8 pytest==7.3.1 @@ -79,36 +85,46 @@ pytest-mock==3.10.0 # via -r requirements/test.in pyyaml==6.0 # via vcrpy -requests==2.30.0 - # via label-studio-sdk -ruff==0.0.265 +requests==2.31.0 + # via + # -c requirements/base.txt + # label-studio-sdk +ruff==0.0.269 # via -r requirements/test.in six==1.16.0 - # via vcrpy + # via + # -c requirements/base.txt + # vcrpy tomli==2.0.1 # via # black # coverage # mypy # pytest -types-markdown==3.4.2.8 +types-markdown==3.4.2.9 # via -r requirements/test.in -types-requests==2.30.0.0 +types-requests==2.31.0.0 # via -r requirements/test.in types-tabulate==0.9.0.2 # via -r requirements/test.in -types-urllib3==1.26.25.12 +types-urllib3==1.26.25.13 # via types-requests -typing-extensions==4.5.0 +typing-extensions==4.6.0 # via + # -c requirements/base.txt # black # mypy # pydantic -urllib3==2.0.2 - # via requests +urllib3==1.26.16 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests vcrpy==4.2.1 # via -r requirements/test.in -wrapt==1.15.0 - # via vcrpy +wrapt==1.14.1 + # via + # -c requirements/base.txt + # vcrpy yarl==1.9.2 # via vcrpy diff --git a/scripts/consistent-deps.sh b/scripts/consistent-deps.sh new file mode 100755 index 0000000000..f64d65c638 --- /dev/null +++ b/scripts/consistent-deps.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +#################################################################################################### +# Check dependency consistency by forcing pip to resolve all the requirement .txt files at once +# (without installing). +#################################################################################################### + +echo "Checking consistency of dependencies..." + +# Joins an array of strings using the specified delimiter. 
+function join_by { + local d=${1-} f=${2-} + if shift 2; then + printf %s "$f" "${@/#/$d}" + fi +} + +# NOTE(alan): Add any dependency files here we don't want to include in the resolution. +excludefiles=("requirements/build.txt") + +# Build an array of requirements files. +shopt -s nullglob +reqfiles=(requirements/*.txt) + +# Remove the excluded files from the array of requirements files. +for excludefile in "${excludefiles[@]}"; do + for i in "${!reqfiles[@]}"; do + if [[ ${reqfiles[i]} = "$excludefile" ]]; then + unset 'reqfiles[i]' + fi + done +done + +# Turn the requirement files array into pip -r flags. +reqstring=$(join_by ' -r ' "${reqfiles[@]}") +reqstring="-r ${reqstring}" + +# This pip command will attempt to resolve the dependencies without installing anything. +pipcommand="pip install --dry-run --ignore-installed ${reqstring}" +if $pipcommand >> /dev/null; +then + echo "Everything looks fine!"; +else + exit 1 +fi diff --git a/setup.py b/setup.py index f6e6207d71..4c7f837413 100644 --- a/setup.py +++ b/setup.py @@ -17,10 +17,28 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +from typing import List, Optional, Union + from setuptools import find_packages, setup from unstructured.__version__ import __version__ + +def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List[str]: + if file_list is None: + file_list = ["requirements/base.in"] + if isinstance(file_list, str): + file_list = [file_list] + requirements: List[str] = [] + for file in file_list: + with open(file, encoding="utf-8") as f: + requirements.extend(f.readlines()) + requirements = [ + req for req in requirements if not req.startswith("#") and not req.startswith("-") + ] + return requirements + + setup( name="unstructured", description="A library that prepares raw documents for downstream ML tasks.", @@ -49,54 +67,19 @@ entry_points={ "console_scripts": ["unstructured-ingest=unstructured.ingest.main:main"], }, - install_requires=[ - "argilla", - "chardet", - "lxml", - "msg_parser", - "nltk", - "openpyxl", - "pandas", - "pdfminer.six", - "pillow", - "pypandoc", - "python-docx", - "python-pptx", - "python-magic", - "markdown", - "requests", - # NOTE(robinson) - The following dependencies are pinned - # to address security scans - "certifi>=2022.12.07", - ], + install_requires=load_requirements(), extras_require={ - "huggingface": [ - "langdetect", - "sacremoses", - "sentencepiece", - "torch", - "transformers", - ], - "local-inference": [ - "unstructured-inference==0.4.4", - ], - "s3": ["s3fs", "fsspec"], - "azure": ["adlfs", "fsspec"], - "discord": ["discord.py"], - "github": [ - # NOTE - pygithub==1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436 - # In the future, we can update this to pygithub>1.58.0 - "pygithub==1.57.0", - ], - "gitlab": ["python-gitlab"], - "reddit": ["praw"], - "slack": ["slack_sdk"], - "wikipedia": ["wikipedia"], - "google-drive": [ - "google-api-python-client", - # consistency with local-inference-pin - "protobuf<3.21", - ], + "huggingface": load_requirements("requirements/huggingface.in"), + 
"local-inference": load_requirements("requirements/local-inference.in"), + "s3": load_requirements("requirements/ingest-s3.in"), + "azure": load_requirements("requirements/ingest-azure.in"), + "discord": load_requirements("requirements/ingest-discord.in"), + "github": load_requirements("requirements/ingest-github.in"), + "gitlab": load_requirements("requirements/ingest-gitlab.in"), + "reddit": load_requirements("requirements/ingest-reddit.in"), + "slack": load_requirements("requirements/ingest-slack.in"), + "wikipedia": load_requirements("requirements/ingest-wikipedia.in"), + "google-drive": load_requirements("requirements/ingest-google-drive.in"), }, package_dir={"unstructured": "unstructured"}, package_data={"unstructured": ["nlp/*.txt"]}, diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6a697d255e..050b3f2ca1 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.9-dev2" # pragma: no cover +__version__ = "0.6.9" # pragma: no cover