From ffa1803f5e1793f4157d1e4d0dcdf1b27c8cde40 Mon Sep 17 00:00:00 2001 From: Michael Landis Date: Fri, 19 May 2023 20:38:42 -0700 Subject: [PATCH 1/2] fix: use docarray extras instead of specifying separate deps The docarray [integration PR](https://github.com/hwchase17/langchain/pull/4483) introduced a pinned dependency on protobuf. As library developers, we should avoid pinned dependencies because they quickly lead to incompatibilities with application code. This is especially true for a heavily used library like protobuf. As we see in the [docarray integration](https://github.com/hwchase17/langchain/pull/4483/files#diff-50c86b7ed8ac2cf95bd48334961bf0530cdc77b5a56f852c5c61b89d735fd711R81-R83), the transitive dependencies of docarray were also listed as langchain dependencies. This is unnecessary as the docarray project has an [extras install](https://github.com/docarray/docarray/blob/a01a05542d17264b8a164bec783633658deeedb8/pyproject.toml#L70) that lists these. This PR reverts the explicit hnswlib and protobuf dependencies and adds the hnswlib extras install for docarray (which installs hnswlib and protobuf). Because version 0.32.0 of the docarray hnswlib extras added protobuf, we bump the docarray dependency from `^0.31.0` to `^0.32.0`. 
--- poetry.lock | 16 +++++++++------- pyproject.toml | 8 +++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/poetry.lock b/poetry.lock index afef457d9c57b..64f96ed4f0197 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1625,19 +1625,21 @@ wmi = ["wmi (>=1.5.1,<2.0.0)"] [[package]] name = "docarray" -version = "0.31.1" +version = "0.32.0" description = "The data structure for multimodal data" category = "main" optional = true python-versions = ">=3.7,<4.0" files = [ - {file = "docarray-0.31.1-py3-none-any.whl", hash = "sha256:286842c84a9946648f36b2a4dc33bcb47589780b4614e5cd32ce67c5a46cb4c0"}, - {file = "docarray-0.31.1.tar.gz", hash = "sha256:096b1eabf0be3c0b1517bbbe82485c19a0de61dde24b8f3448f26c5ead672c4a"}, + {file = "docarray-0.32.0-py3-none-any.whl", hash = "sha256:5216858966ea42133614be421ef7ae670d020bfdfcd2ab3e0118a4a8ecc77034"}, + {file = "docarray-0.32.0.tar.gz", hash = "sha256:7a3156cb0d13dec7d6b85f193b339b823748446fc9fff1e0ca4c2ef50b4183d2"}, ] [package.dependencies] +hnswlib = {version = ">=0.6.2", optional = true, markers = "extra == \"hnswlib\""} numpy = ">=1.17.3" orjson = ">=3.8.2" +protobuf = {version = ">=3.19.0", optional = true, markers = "extra == \"proto\" or extra == \"hnswlib\" or extra == \"full\""} pydantic = ">=1.10.2" rich = ">=13.1.0" types-requests = ">=2.28.11.6" @@ -1648,7 +1650,7 @@ audio = ["pydub (>=0.25.1,<0.26.0)"] aws = ["smart-open[s3] (>=6.3.0)"] elasticsearch = ["elastic-transport (>=8.4.0,<9.0.0)", "elasticsearch (>=7.10.1)"] full = ["av (>=10.0.0)", "lz4 (>=1.0.0)", "pandas (>=1.1.0)", "pillow (>=9.3.0)", "protobuf (>=3.19.0)", "pydub (>=0.25.1,<0.26.0)", "trimesh[easy] (>=3.17.1)", "types-pillow (>=9.3.0.1)"] -hnswlib = ["hnswlib (>=0.6.2)"] +hnswlib = ["hnswlib (>=0.6.2)", "protobuf (>=3.19.0)"] image = ["pillow (>=9.3.0)", "types-pillow (>=9.3.0.1)"] jac = ["jina-hubble-sdk (>=0.34.0)"] mesh = ["trimesh[easy] (>=3.17.1)"] @@ -10330,12 +10332,12 @@ cffi = {version = ">=1.11", markers = 
"platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "gql", "hnswlib", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "protobuf", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] +all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "gql", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "chardet", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"] 
-hnswlib = ["docarray", "hnswlib", "protobuf"] +hnswlib = ["docarray"] in-memory-store = ["docarray"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] openai = ["openai", "tiktoken"] @@ -10345,4 +10347,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "2b19b9deca7f83ca14af1f7bc7808bbe7873a91ce4c95381eaad8ea84fe04c0b" +content-hash = "dc7fd55ba9f97482e3be80367c0ec050df7d6648543bfc8fc4c78c43ad2e7ccc" diff --git a/pyproject.toml b/pyproject.toml index 0164205464985..fa870568be290 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,9 +79,7 @@ O365 = {version = "^2.0.26", optional = true} jq = {version = "^1.4.1", optional = true} steamship = {version = "^2.16.9", optional = true} pdfminer-six = {version = "^20221105", optional = true} -docarray = {version="^0.31.0", optional=true} -protobuf = {version="3.19.6", optional=true} -hnswlib = {version="^0.7.0", optional=true} +docarray = {version="^0.32.0", extras=["hnswlib"], optional=true} lxml = {version = "^4.9.2", optional = true} pymupdf = {version = "^1.22.3", optional = true} pypdfium2 = {version = "^4.10.0", optional = true} @@ -180,10 +178,10 @@ openai = ["openai", "tiktoken"] text_helpers = ["chardet"] cohere = ["cohere"] in_memory_store = ["docarray"] -hnswlib = ["docarray", "protobuf", "hnswlib"] +hnswlib = ["docarray"] embeddings = ["sentence-transformers"] azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", 
"aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib", "steamship", "pdfminer-six", "gql"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "gql"] # An extra used to be able to add extended testing. # Please use new-line on formatting to make it easier to add new packages without # merge-conflicts From 07a4ded643441e021e0c2634f1d60298f699f2b2 Mon Sep 17 00:00:00 2001 From: Michael Landis Date: Sat, 20 May 2023 08:01:07 -0700 Subject: [PATCH 2/2] chore: clarify docarray extras name An extra was previously introduced for "hnswlib" which only installed "docarray". While "hnswlib" is a dependency of docarray, docarray is a separate integration, so this is misleading to users. There was also a separate extras for "in_memory_store". This is again misleading since there are various in memory stores in the system that are very specific. 
--- poetry.lock | 5 ++--- pyproject.toml | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index dda7292639bf5..0f5077f23b6ad 100644 --- a/poetry.lock +++ b/poetry.lock @@ -10350,10 +10350,9 @@ cffi = ["cffi (>=1.11)"] all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "lark", "lxml", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] +docarray = ["docarray"] embeddings = ["sentence-transformers"] extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "chardet", "gql", "html2text", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "requests-toolbelt", "telethon", "tqdm", "zep-python"] -hnswlib = ["docarray"] -in-memory-store = ["docarray"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] @@ -10362,4 +10361,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "316d78c41944c9810a546910437703e0acabe3d05e22a10ce9bbcab7d0d89679" +content-hash = "dbf7444951485c14edd73ebfe7ffabb4c44d8cde5729ca30e5acc98a71e1a132" diff --git a/pyproject.toml b/pyproject.toml 
index 457fa135ed228..a9c6229c953c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -178,8 +178,7 @@ qdrant = ["qdrant-client"] openai = ["openai", "tiktoken"] text_helpers = ["chardet"] cohere = ["cohere"] -in_memory_store = ["docarray"] -hnswlib = ["docarray"] +docarray = ["docarray"] embeddings = ["sentence-transformers"] azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"] all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml"]