From 5a59521d43700bf5616b325a6cecf25ab65375d8 Mon Sep 17 00:00:00 2001 From: Sharan Shirodkar <91109427+sharanshirodkar7@users.noreply.github.com> Date: Wed, 28 Aug 2024 09:13:24 -0400 Subject: [PATCH] adding lancedb to langchain vectorstores (#291) * adding lancedb to langchain vectorstores Signed-off-by: sharanshirodkar7 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: sharanshirodkar7 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: lvliang-intel --- .../vectorstores/langchain/lancedb/README.md | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 comps/vectorstores/langchain/lancedb/README.md diff --git a/comps/vectorstores/langchain/lancedb/README.md b/comps/vectorstores/langchain/lancedb/README.md new file mode 100644 index 0000000000..bfe01585c2 --- /dev/null +++ b/comps/vectorstores/langchain/lancedb/README.md @@ -0,0 +1,139 @@ +# LanceDB + +LanceDB is an embedded vector database for AI applications. It is open source and distributed with an Apache-2.0 license. + +LanceDB datasets are persisted to disk and can be shared in Python. + +## Setup + +```bash +npm install -S vectordb +``` + +## Usage + +### Create a new index from texts + +```python +import os +import tempfile +from langchain.vectorstores import LanceDB +from langchain.embeddings.openai import OpenAIEmbeddings +from vectordb import connect + + +async def run(): + dir = tempfile.mkdtemp(prefix="lancedb-") + db = await connect(dir) + table = await db.create_table("vectors", [{"vector": [0] * 1536, "text": "sample", "id": 1}]) + + vector_store = await LanceDB.from_texts( + ["Hello world", "Bye bye", "hello nice world"], + [{"id": 2}, {"id": 1}, {"id": 3}], + OpenAIEmbeddings(), + table=table, + ) + + result_one = await vector_store.similarity_search("hello world", 1) + print(result_one) + # [ Document(page_content='hello nice world', metadata={'id': 3}) ] + + +# Run the function +import asyncio + +asyncio.run(run()) +``` + +API Reference: + +- `LanceDB` from `@langchain/community/vectorstores/lancedb` +- `OpenAIEmbeddings` from `@langchain/openai` + +### Create a new index from a loader + +```python +import os +import tempfile +from langchain.vectorstores import LanceDB +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.document_loaders.fs import TextLoader +from vectordb import connect + +# Create docs with a loader +loader = TextLoader("src/document_loaders/example_data/example.txt") +docs = loader.load() + + +async def run(): + dir = tempfile.mkdtemp(prefix="lancedb-") + db = await connect(dir) + table = await db.create_table("vectors", [{"vector": [0] * 1536, "text": "sample", "source": "a"}]) + + vector_store = await LanceDB.from_documents(docs, OpenAIEmbeddings(), table=table) + + result_one = await vector_store.similarity_search("hello world", 1) + print(result_one) + # [ + # Document(page_content='Foo\nBar\nBaz\n\n', metadata={'source': 'src/document_loaders/example_data/example.txt'}) + # ] + + +# Run the function +import asyncio + +asyncio.run(run()) +``` + +API Reference: + +- `LanceDB` from `@langchain/community/vectorstores/lancedb` +- `OpenAIEmbeddings` from `@langchain/openai` +- `TextLoader` from `langchain/document_loaders/fs/text` + +### Open an existing dataset + +```python +import os +import tempfile +from langchain.vectorstores import LanceDB +from langchain.embeddings.openai import OpenAIEmbeddings +from vectordb import connect + + +async def run(): + uri = await create_test_db() + db = await connect(uri) + table = await db.open_table("vectors") + + vector_store = LanceDB(OpenAIEmbeddings(), table=table) + + result_one = await vector_store.similarity_search("hello world", 1) + print(result_one) + # [ Document(page_content='Hello world', metadata={'id': 1}) ] + + +async def create_test_db(): + dir = tempfile.mkdtemp(prefix="lancedb-") + db = await connect(dir) + await db.create_table( + "vectors", + [ + {"vector": [0] * 1536, "text": "Hello world", "id": 1}, + {"vector": [0] * 1536, "text": "Bye bye", "id": 2}, + {"vector": [0] * 1536, "text": "hello nice world", "id": 3}, + ], + ) + return dir + + +# Run the function +import asyncio + +asyncio.run(run()) +``` + +API Reference: + +- `LanceDB` from `@langchain/community/vectorstores/lancedb` +- `OpenAIEmbeddings` from `@langchain/openai`