diff --git a/backend/poetry.lock b/backend/poetry.lock index be2c123..ebd1a0d 100644 --- a/backend/poetry.lock +++ b/backend/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohttp" @@ -366,6 +366,21 @@ files = [ marshmallow = ">=3.18.0,<4.0.0" typing-inspect = ">=0.4.0,<1" +[[package]] +name = "dataclasses-json-speakeasy" +version = "0.5.11" +description = "Easily serialize dataclasses to and from JSON." +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "dataclasses_json_speakeasy-0.5.11-py3-none-any.whl", hash = "sha256:ac52a069a01e8521015d682f37849bfdf056c36fa3f81497055e201fec684104"}, + {file = "dataclasses_json_speakeasy-0.5.11.tar.gz", hash = "sha256:418a987cea2ccf4e4be662f39faa5cc79b47b147c9d1a69d6928d6a27e0c17e8"}, +] + +[package.dependencies] +marshmallow = ">=3.18.0,<4.0.0" +typing-inspect = ">=0.4.0,<1" + [[package]] name = "deprecated" version = "1.2.14" @@ -785,6 +800,17 @@ files = [ {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, ] +[[package]] +name = "jsonpath-python" +version = "1.0.6" +description = "A more powerful JSONPath implementation in modern python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "jsonpath-python-1.0.6.tar.gz", hash = "sha256:dd5be4a72d8a2995c3f583cf82bf3cd1a9544cfdabf2d22595b67aff07349666"}, + {file = "jsonpath_python-1.0.6-py3-none-any.whl", hash = "sha256:1e3b78df579f5efc23565293612decee04214609208a2335884b3ee3f786b575"}, +] + [[package]] name = "langdetect" version = "1.0.9" @@ -1812,6 +1838,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2342,13 +2369,13 @@ files = [ [[package]] name = "unstructured" -version = "0.10.30" +version = "0.14.3" description = "A library that prepares raw documents for downstream ML tasks." optional = false -python-versions = ">=3.7.0" +python-versions = "<3.13,>=3.9.0" files = [ - {file = "unstructured-0.10.30-py3-none-any.whl", hash = "sha256:0615f14daa37450e9c0fcf3c3fd178c3a06b6b8d006a36d1a5e54dbe487aa6b6"}, - {file = "unstructured-0.10.30.tar.gz", hash = "sha256:a86c3d15c572a28322d83cb5ecf0ac7a24f1c36864fb7c68df096de8a1acc106"}, + {file = "unstructured-0.14.3-py3-none-any.whl", hash = "sha256:9f94607aaae66a543315062bf10bd3a196b5809d2747e15567d1a814764bcf8c"}, + {file = "unstructured-0.14.3.tar.gz", hash = "sha256:57156e67edf91d1f1a3c13451ca0dd834a8108090eca3cd2e3a99eb045b18612"}, ] [package.dependencies] @@ -2368,56 +2395,103 @@ rapidfuzz = "*" requests = "*" tabulate = "*" typing-extensions = "*" +unstructured-client = "*" +wrapt = "*" [package.extras] airtable = ["pyairtable"] -all-docs = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.1.0)", "python-pptx (<=0.6.23)", "unstructured-inference (==0.7.11)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] -azure = ["adlfs", "fsspec (==2023.9.1)"] +all-docs = ["effdet", "google-cloud-vision", "markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypandoc", "pypdf", "pytesseract", "python-docx", "python-pptx (<=0.6.23)", "unstructured-inference (==0.7.33)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +astra = ["astrapy"] +azure = ["adlfs", "fsspec"] azure-cognitive-search = ["azure-search-documents"] -bedrock = ["boto3", "langchain"] +bedrock = ["boto3", "langchain-community"] biomed = ["bs4"] -box = ["boxfs", "fsspec (==2023.9.1)"] +box = ["boxfs", "fsspec"] +chroma = ["chromadb", "importlib-metadata (>=7.1.0)", "typer (<=0.9.0)"] +clarifai = ["clarifai"] confluence = ["atlassian-python-api"] csv = ["pandas"] -delta-table = ["deltalake", "fsspec (==2023.9.1)"] +databricks-volumes = ["databricks-sdk"] +delta-table = ["deltalake", "fsspec"] discord = ["discord-py"] -doc = ["python-docx (>=1.1.0)"] -docx = ["python-docx (>=1.1.0)"] -dropbox = ["dropboxdrivefs", "fsspec (==2023.9.1)"] -elasticsearch = ["elasticsearch", "jq"] -embed-huggingface = ["huggingface", "langchain", "sentence-transformers"] +doc = ["python-docx"] +docx = ["python-docx"] +dropbox = ["dropboxdrivefs", "fsspec"] +elasticsearch = ["elasticsearch"] +embed-huggingface = ["huggingface", "langchain-community", "sentence-transformers"] +embed-octoai = ["openai", "tiktoken"] +embed-vertexai = ["langchain", "langchain-community", "langchain-google-vertexai"] +embed-voyageai = ["langchain", "langchain-voyageai"] epub = ["pypandoc"] -gcs = ["bs4", "fsspec (==2023.9.1)", "gcsfs"] +gcs = ["bs4", "fsspec", "gcsfs"] github = ["pygithub (>1.58.0)"] gitlab = ["python-gitlab"] google-drive = ["google-api-python-client"] +hubspot = ["hubspot-api-client", "urllib3"] huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] -image = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.11)", "unstructured.pytesseract (>=0.3.12)"] +image = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypdf", "pytesseract", "unstructured-inference (==0.7.33)", "unstructured.pytesseract (>=0.3.12)"] jira = ["atlassian-python-api"] -local-inference = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.1.0)", "python-pptx (<=0.6.23)", "unstructured-inference (==0.7.11)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +local-inference = ["effdet", "google-cloud-vision", "markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypandoc", "pypdf", "pytesseract", "python-docx", "python-pptx (<=0.6.23)", "unstructured-inference (==0.7.33)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] md = ["markdown"] +mongodb = ["pymongo"] msg = ["msg-parser"] notion = ["htmlBuilder", "notion-client"] -odt = ["pypandoc", "python-docx (>=1.1.0)"] -onedrive = ["Office365-REST-Python-Client (<2.4.3)", "bs4", "msal"] -openai = ["langchain", "openai", "tiktoken"] +odt = ["pypandoc", "python-docx"] +onedrive = ["Office365-REST-Python-Client", "bs4", "msal"] +openai = ["langchain-community", "openai", "tiktoken"] +opensearch = ["opensearch-py"] org = ["pypandoc"] -outlook = ["Office365-REST-Python-Client (<2.4.3)", "msal"] +outlook = ["Office365-REST-Python-Client", "msal"] paddleocr = ["unstructured.paddleocr (==2.6.1.3)"] -pdf = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.11)", "unstructured.pytesseract (>=0.3.12)"] +pdf = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypdf", "pytesseract", "unstructured-inference (==0.7.33)", "unstructured.pytesseract (>=0.3.12)"] +pinecone = ["pinecone-client (>=3.7.1)"] +postgres = ["psycopg2-binary"] ppt = ["python-pptx (<=0.6.23)"] pptx = ["python-pptx (<=0.6.23)"] +qdrant = ["qdrant-client"] reddit = ["praw"] rst = ["pypandoc"] rtf = ["pypandoc"] -s3 = ["fsspec (==2023.9.1)", "s3fs"] +s3 = ["fsspec", "s3fs"] salesforce = ["simple-salesforce"] -sharepoint = ["Office365-REST-Python-Client (<2.4.3)", "msal"] +sftp = ["fsspec", "paramiko"] +sharepoint = ["Office365-REST-Python-Client", "msal"] slack = ["slack-sdk"] tsv = ["pandas"] +weaviate = ["weaviate-client"] wikipedia = ["wikipedia"] xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] +[[package]] +name = "unstructured-client" +version = "0.18.0" +description = "Python Client SDK for Unstructured API" +optional = false +python-versions = ">=3.8" +files = [ + {file = "unstructured-client-0.18.0.tar.gz", hash = "sha256:b5f1866b6a48d2e28645e37e86c9d58b1ee7df2d88e79adf873572338c027aa8"}, + {file = "unstructured_client-0.18.0-py3-none-any.whl", hash = "sha256:36d8c5cb01b97a87e271e11d4d5a063d1c5b85fc5fd7f07819c35a9bef74821f"}, +] + +[package.dependencies] +certifi = ">=2023.7.22" +charset-normalizer = ">=3.2.0" +dataclasses-json-speakeasy = ">=0.5.11" +idna = ">=3.4" +jsonpath-python = ">=1.0.6" +marshmallow = ">=3.19.0" +mypy-extensions = ">=1.0.0" +packaging = ">=23.1" +python-dateutil = ">=2.8.2" +requests = ">=2.31.0" +six = ">=1.16.0" +typing-extensions = ">=4.7.1" +typing-inspect = ">=0.9.0" +urllib3 = ">=1.26.18" + +[package.extras] +dev = ["pylint (==2.16.2)"] + [[package]] name = "urllib3" version = "2.1.0" @@ -2856,4 +2930,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.11,<3.12" -content-hash = "6b6b53757e8759ddb6d2b19538614c36c7230e136e7e44926b9037645951cd93" +content-hash = "9ee92be8a75f8c2aff95f4decaac99ff760668c0673839045ec8d7b8719ee674" diff --git a/backend/pyproject.toml b/backend/pyproject.toml index e131c9d..4a5ac66 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -10,7 +10,7 @@ python = "^3.11,<3.12" fastapi = "^0.104.1" uvicorn = { extras = ["standard"], version = "^0.23.2" } python-dotenv = "^1.0.0" -unstructured = "0.10.30" +unstructured = "0.14.3" Jinja2 = "3.1.2" llama-index-core = "^0.10.18.post1" llama-index-embeddings-openai = "^0.1.6"