-
Notifications
You must be signed in to change notification settings - Fork 15.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement NucliaDB vector store (#10236)
# Description This pull request allows using [NucliaDB](https://docs.nuclia.dev/docs/docs/nucliadb/intro) as a vector store in LangChain. It works with either a [local NucliaDB instance](https://docs.nuclia.dev/docs/docs/nucliadb/deploy/basics) or [Nuclia Cloud](https://nuclia.cloud). # Dependencies It requires an up-to-date version of the `nuclia` Python package. @rlancemartin, @eyurtsev, @hinthornw, please review it when you have a moment :) Note: our Twitter handle is `@NucliaAI`
- Loading branch information
Showing
6 changed files
with
714 additions
and
172 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
151 changes: 151 additions & 0 deletions
151
docs/extras/integrations/vectorstores/nucliadb_vectorstore.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# NucliaDB vector store\n", | ||
"\n", | ||
"You can use a local NucliaDB instance or use [Nuclia Cloud](https://nuclia.cloud).\n", | ||
"\n", | ||
"When using a local instance, you need a Nuclia Understanding API key, so your texts are properly vectorized and indexed. You can get a key by creating a free account at [https://nuclia.cloud](https://nuclia.cloud), and then [creating a NUA key](https://docs.nuclia.dev/docs/docs/using/understanding/intro)." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#!pip install nuclia" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Usage with nuclia.cloud" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "ValueError", | ||
"evalue": "nuclia python package not found. Please install it with `pip install nuclia`.", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", | ||
"File \u001b[0;32m~/dev/osprojects/langchain/libs/langchain/langchain/vectorstores/nucliadb.py:39\u001b[0m, in \u001b[0;36mNucliaDB.__init__\u001b[0;34m(self, knowledge_box, local, api_key, backend)\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 39\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mnuclia\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msdk\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaAuth\n\u001b[1;32m 40\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m:\n", | ||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'nuclia'", | ||
"\nDuring handling of the above exception, another exception occurred:\n", | ||
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", | ||
"Cell \u001b[0;32mIn[1], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvectorstores\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mnucliadb\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaDB\n\u001b[1;32m 2\u001b[0m API_KEY \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mYOUR_API_KEY\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 4\u001b[0m ndb \u001b[39m=\u001b[39m NucliaDB(knowledge_box\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mYOUR_KB_ID\u001b[39;49m\u001b[39m\"\u001b[39;49m, local\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m, api_key\u001b[39m=\u001b[39;49mAPI_KEY)\n", | ||
"File \u001b[0;32m~/dev/osprojects/langchain/libs/langchain/langchain/vectorstores/nucliadb.py:41\u001b[0m, in \u001b[0;36mNucliaDB.__init__\u001b[0;34m(self, knowledge_box, local, api_key, backend)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mnuclia\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msdk\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaAuth\n\u001b[1;32m 40\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m:\n\u001b[0;32m---> 41\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 42\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mnuclia python package not found. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 43\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mPlease install it with `pip install nuclia`.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 44\u001b[0m )\n\u001b[1;32m 45\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_config[\u001b[39m\"\u001b[39m\u001b[39mLOCAL\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m local\n\u001b[1;32m 46\u001b[0m zone \u001b[39m=\u001b[39m os\u001b[39m.\u001b[39menviron\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mNUCLIA_ZONE\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39meurope-1\u001b[39m\u001b[39m\"\u001b[39m)\n", | ||
"\u001b[0;31mValueError\u001b[0m: nuclia python package not found. Please install it with `pip install nuclia`." | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from langchain.vectorstores.nucliadb import NucliaDB\n", | ||
"API_KEY = \"YOUR_API_KEY\"\n", | ||
"\n", | ||
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=False, api_key=API_KEY)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Usage with a local instance\n", | ||
"\n", | ||
"Note: By default `backend` is set to `http://localhost:8080`." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.vectorstores.nucliadb import NucliaDB\n", | ||
"\n", | ||
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=True, backend=\"http://my-local-server\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Add and delete texts to your Knowledge Box" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ids = ndb.add_texts([\"This is a new test\", \"This is a second test\"])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ndb.delete(ids=ids)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Search in your Knowledge Box" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"results = ndb.similarity_search(\"Who was inspired by Ada Lovelace?\")\n", | ||
"print(results[0].page_content)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "langchain", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.5" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
import os | ||
from typing import Any, Dict, Iterable, List, Optional, Type | ||
|
||
from langchain.embeddings.base import Embeddings | ||
from langchain.schema.document import Document | ||
from langchain.vectorstores.base import VST, VectorStore | ||
|
||
# Maps the one-letter field-type code found in a paragraph id to the name of
# the attribute on a resource's ``data`` that holds fields of that type.
FIELD_TYPES: Dict[str, str] = {"f": "files", "t": "texts", "l": "links"}
|
||
|
||
class NucliaDB(VectorStore):
    """NucliaDB vector store.

    Talks to a Knowledge Box through the ``nuclia`` SDK, either on a local
    NucliaDB instance or on Nuclia Cloud.
    """

    # Annotation only: the dict is created per instance in ``__init__``.
    # A class-level ``{}`` would be shared (and overwritten) by every store.
    _config: Dict[str, Any]

    def __init__(
        self,
        knowledge_box: str,
        local: bool,
        api_key: Optional[str] = None,
        backend: Optional[str] = None,
    ) -> None:
        """Initialize the NucliaDB client.

        Args:
            knowledge_box: the Knowledge Box id.
            local: Whether to use a local NucliaDB instance or Nuclia Cloud
            api_key: A contributor API key for the kb (needed when local is False)
            backend: The backend url to use when local is True, defaults to
                http://localhost:8080

        Raises:
            ValueError: If the ``nuclia`` package cannot be imported.
        """
        try:
            from nuclia.sdk import NucliaAuth
        except ImportError as err:
            raise ValueError(
                "nuclia python package not found. "
                "Please install it with `pip install nuclia`."
            ) from err

        # ``kb_url`` reads both ``_kb`` and ``_config["BACKEND"]``, so both
        # must be set before the authentication calls below.
        self._kb = knowledge_box
        self._config = {"LOCAL": local}
        if local:
            if not backend:
                backend = "http://localhost:8080"
            self._config["BACKEND"] = f"{backend}/api/v1"
            self._config["TOKEN"] = None
            NucliaAuth().nucliadb(url=backend)
            NucliaAuth().kb(url=self.kb_url, interactive=False)
        else:
            # The cloud zone is only relevant when not running locally.
            zone = os.environ.get("NUCLIA_ZONE", "europe-1")
            self._config["BACKEND"] = f"https://{zone}.nuclia.cloud/api/v1"
            self._config["TOKEN"] = api_key
            NucliaAuth().kb(url=self.kb_url, token=api_key, interactive=False)

    @property
    def is_local(self) -> bool:
        """Return the ``local`` flag this store was created with."""
        return self._config["LOCAL"]

    @property
    def kb_url(self) -> str:
        """Return the Knowledge Box URL: ``<backend>/kb/<knowledge box id>``."""
        return f"{self._config['BACKEND']}/kb/{self._kb}"

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Upload texts to NucliaDB.

        Args:
            texts: The texts to upload; one resource is created per text.
            metadatas: Optional metadata, matched to ``texts`` by position.
                When omitted (or empty) each resource gets ``""`` as metadata.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The values returned by ``NucliaResource.create``, in input order.
        """
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        created_ids = []
        for index, text in enumerate(texts):
            extra: Dict[str, Any] = {
                "metadata": metadatas[index] if metadatas else ""
            }
            resource_id = factory.create(
                texts={"text": {"body": text}},
                extra=extra,
                url=self.kb_url,
                api_key=self._config["TOKEN"],
            )
            created_ids.append(resource_id)
        return created_ids

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete resources from the Knowledge Box.

        Args:
            ids: Resource ids to delete.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            ``None`` when ``ids`` is empty or not given; otherwise ``True`` if
            no deletion raised ``ValueError`` and ``False`` if at least one did.
        """
        if not ids:
            return None
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        all_deleted = True
        # Keep going after a failure so every id is attempted.
        for rid in ids:
            try:
                factory.delete(rid=rid, url=self.kb_url, api_key=self._config["TOKEN"])
            except ValueError:
                all_deleted = False
        return all_deleted

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Search the Knowledge Box and return the matching paragraphs.

        Args:
            query: The search query.
            k: Sent to NucliaDB as the request ``page_size``.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            One ``Document`` per matching paragraph, sorted by the paragraph's
            ``order``. ``page_content`` is the paragraph text and ``metadata``
            holds ``extra`` (the resource's extra metadata, if any) and
            ``value`` (the field the paragraph belongs to, if found).
        """
        from nuclia.sdk import NucliaSearch
        from nucliadb_models.search import FindRequest, ResourceProperties

        request = FindRequest(
            query=query,
            page_size=k,
            show=[ResourceProperties.VALUES, ResourceProperties.EXTRA],
        )
        results = NucliaSearch().find(
            query=request, url=self.kb_url, api_key=self._config["TOKEN"]
        )

        paragraphs = []
        for resource in results.resources.values():
            # Same for every paragraph of the resource, so look it up once.
            extra_metadata = getattr(
                getattr(resource, "extra", {}), "metadata", None
            )
            for field in resource.fields.values():
                for paragraph_id, paragraph in field.paragraphs.items():
                    # Paragraph ids are "/"-separated; parts 1 and 2 are used
                    # as the field-type code and the field id.
                    info = paragraph_id.split("/")
                    field_type = FIELD_TYPES.get(info[1], None)
                    field_id = info[2]
                    if not field_type:
                        continue
                    # ``or {}`` also covers an attribute that exists but is
                    # None (NOTE(review): presumably possible - confirm).
                    fields_of_type = getattr(resource.data, field_type, None) or {}
                    paragraphs.append(
                        {
                            "text": paragraph.text,
                            "metadata": {
                                "extra": extra_metadata,
                                "value": fields_of_type.get(field_id, None),
                            },
                            "order": paragraph.order,
                        }
                    )

        paragraphs.sort(key=lambda item: item["order"])
        return [
            Document(page_content=item["text"], metadata=item["metadata"])
            for item in paragraphs
        ]

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> VST:
        """Return VectorStore initialized from texts and embeddings.

        Not supported by this store: always raises ``NotImplementedError``.
        """
        raise NotImplementedError
Oops, something went wrong.