Skip to content

Commit

Permalink
feat: add multi-tenancy support(#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
gusye1234 committed Nov 11, 2024
1 parent c08a3c3 commit 0b26df1
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 3 deletions.
4 changes: 2 additions & 2 deletions nano_vectordb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .dbs import NanoVectorDB
from .dbs import NanoVectorDB, MultiTenantNanoVDB

__version__ = "0.0.4.2"
__version__ = "0.0.4.dev"
__author__ = "Jianbai Ye"
__url__ = "https://github.com/gusye1234/nano-vectordb"
84 changes: 84 additions & 0 deletions nano_vectordb/dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
import json
import base64
import hashlib
from uuid import uuid4
import numpy as np
from typing import TypedDict, Literal, Union, Callable
from dataclasses import dataclass, asdict
import sqlite3
import logging
from logging import getLogger

Expand Down Expand Up @@ -78,6 +80,12 @@ def __post_init__(self):
self.pre_process()
logger.info(f"Init {asdict(self)} {len(self.__storage['data'])} data")

def get_additional_data(self):
    """Return the value stored under ``"additional_data"``.

    Returns:
        The stored mapping, or a new empty dict when nothing has been
        stored under that key.
    """
    try:
        return self.__storage["additional_data"]
    except KeyError:
        # Nothing has been stored yet.
        return {}

def store_additional_data(self, **kwargs):
    """Store the given keyword arguments under ``"additional_data"``.

    Any value previously stored under that key is replaced, not merged.

    Args:
        **kwargs: Arbitrary key/value pairs to keep alongside the storage.
    """
    self.__storage.update({"additional_data": kwargs})

def upsert(self, datas: list[Data]):
_index_datas = {
data.get(f_ID, hash_ndarray(data[f_VECTOR])): data for data in datas
Expand Down Expand Up @@ -181,3 +189,79 @@ def _cosine_query(
break
results.append({**self.__storage["data"][abs_i], f_METRICS: scores[rel_i]})
return results


@dataclass
class MultiTenantNanoVDB:
    """Manage several ``NanoVectorDB`` instances (tenants) behind a bounded cache.

    Every tenant is backed by its own JSON file inside ``storage_dir``. At
    most ``max_capacity`` tenants are held in memory; when the cache is full,
    the tenant that entered the cache first is saved and evicted (FIFO).

    Attributes:
        embedding_dim: Embedding dimension passed to every tenant database.
        metric: Metric passed to every tenant database.
        max_capacity: Maximum number of tenants kept in memory; must be >= 1.
        storage_dir: Directory that holds one JSON file per tenant.
    """

    embedding_dim: int
    metric: Literal["cosine"] = "cosine"
    max_capacity: int = 1000
    storage_dir: str = "./nano_multi_tenant_storage"

    @staticmethod
    def jsonfile_from_id(tenant_id):
        """Return the file name (without directory) used for ``tenant_id``."""
        return f"nanovdb_{tenant_id}.json"

    def __post_init__(self):
        """Validate ``max_capacity`` and set up the empty in-memory cache.

        Raises:
            ValueError: If ``max_capacity`` is smaller than 1.
        """
        if self.max_capacity < 1:
            raise ValueError("max_capacity should be greater than 0")
        # tenant_id -> database currently held in memory
        self.__storage: dict[str, NanoVectorDB] = {}
        # tenant ids in the order they entered the cache; index 0 is evicted first
        self.__cache_queue: list[str] = []

    def __tenant_file(self, tenant_id: str) -> str:
        """Return the path of the JSON file that backs ``tenant_id``."""
        return os.path.join(self.storage_dir, self.jsonfile_from_id(tenant_id))

    def contain_tenant(self, tenant_id: str) -> bool:
        """Return True if ``tenant_id`` is cached in memory or has a file on disk."""
        return tenant_id in self.__storage or os.path.exists(
            self.__tenant_file(tenant_id)
        )

    def __load_tenant_in_cache(
        self, tenant_id: str, in_memory_tenant: NanoVectorDB
    ) -> NanoVectorDB:
        """Register ``in_memory_tenant`` in the cache, evicting the oldest if full.

        The evicted tenant is saved before it is dropped, so the storage
        directory is created first if it does not exist yet.

        Returns:
            The tenant that was just cached.
        """
        if len(self.__storage) >= self.max_capacity:
            evicted = self.__storage.pop(self.__cache_queue.pop(0))
            os.makedirs(self.storage_dir, exist_ok=True)
            evicted.save()
        self.__storage[tenant_id] = in_memory_tenant
        self.__cache_queue.append(tenant_id)
        return in_memory_tenant

    def __load_tenant(self, tenant_id: str) -> NanoVectorDB:
        """Return the tenant from the cache, loading it from its file if needed.

        Raises:
            ValueError: If the tenant is neither cached nor present on disk.
        """
        if tenant_id in self.__storage:
            return self.__storage[tenant_id]
        if not self.contain_tenant(tenant_id):
            raise ValueError(f"Tenant {tenant_id} not in storage")

        in_memory_tenant = NanoVectorDB(
            self.embedding_dim,
            metric=self.metric,
            storage_file=self.__tenant_file(tenant_id),
        )
        return self.__load_tenant_in_cache(tenant_id, in_memory_tenant)

    def create_tenant(self) -> str:
        """Create a new tenant under a fresh UUID and return that id.

        The new tenant is only placed in the in-memory cache here; this
        method itself does not call ``save()`` on it.
        """
        tenant_id = str(uuid4())
        in_memory_tenant = NanoVectorDB(
            self.embedding_dim,
            metric=self.metric,
            storage_file=self.__tenant_file(tenant_id),
        )
        self.__load_tenant_in_cache(tenant_id, in_memory_tenant)
        return tenant_id

    def delete_tenant(self, tenant_id: str):
        """Drop ``tenant_id`` from the cache and remove its file if one exists.

        Unknown tenant ids are ignored.
        """
        if tenant_id in self.__storage:
            self.__storage.pop(tenant_id)
            self.__cache_queue.remove(tenant_id)
        tenant_file = self.__tenant_file(tenant_id)
        if os.path.exists(tenant_file):
            os.remove(tenant_file)

    def get_tenant(self, tenant_id: str) -> NanoVectorDB:
        """Return the database of ``tenant_id``.

        Raises:
            ValueError: If the tenant is neither cached nor present on disk.
        """
        return self.__load_tenant(tenant_id)

    def save(self):
        """Call ``save()`` on every tenant currently held in memory.

        The storage directory is created first if it does not exist.
        """
        os.makedirs(self.storage_dir, exist_ok=True)
        for db in self.__storage.values():
            db.save()
39 changes: 39 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

🏃 It's okay for your prototypes, maybe even more.

🏃 Supports naive [multi-tenancy](#multi-tenancy-beta).



## Install
Expand Down Expand Up @@ -107,6 +109,43 @@ print(vdb.get(r["insert"]))
vdb.delete(r["insert"])
```

### Additional Data

```python
vdb.store_additional_data(a=1, b=2, c=3)
print(vdb.get_additional_data())
```



## Multi-Tenancy (beta)

If you need to work with multiple vector databases, you can use `MultiTenantNanoVDB` to manage them:

```python
from nano_vectordb import NanoVectorDB, MultiTenantNanoVDB

multi_tenant = MultiTenantNanoVDB(1024)
tenant_id = multi_tenant.create_tenant()

# `tenant` is a NanoVectorDB; you can upsert, query, get, ... on it.
tenant: NanoVectorDB = multi_tenant.get_tenant(tenant_id)

# some chores:
multi_tenant.delete_tenant(tenant_id)
multi_tenant.contain_tenant(tenant_id)

# save it
multi_tenant.save()
```

`MultiTenantNanoVDB` uses a first-in, first-out queue to limit how many vector DBs are held in memory at once. You can adjust the limit with the `max_capacity` parameter:

```python
# At most `max_capacity` NanoVectorDB instances are kept in memory.
multi_tenant = MultiTenantNanoVDB(1024, max_capacity=1)
```



## Benchmark
Expand Down
60 changes: 59 additions & 1 deletion tests/test_init.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import pytest
import numpy as np
from nano_vectordb import NanoVectorDB
from nano_vectordb import NanoVectorDB, MultiTenantNanoVDB
from nano_vectordb.dbs import f_METRICS, f_ID, f_VECTOR


Expand Down Expand Up @@ -101,3 +102,60 @@ def test_cond_filter():

r = a.query(query_data, 10, filter_lambda=cond_filer)
assert r[0][f_ID] == 1


def test_additonal_data():
    """Additional data stored on a NanoVectorDB survives a save/reload cycle.

    The storage file is removed in a ``finally`` block so that a failing
    assertion does not leave it behind for later test runs.
    """
    fake_dim = 1024
    storage_file = "nano-vectordb.json"

    try:
        a = NanoVectorDB(fake_dim)
        a.store_additional_data(a=1, b=2, c=3)
        a.save()

        # A second instance built with the same arguments must see the data.
        a = NanoVectorDB(fake_dim)
        assert a.get_additional_data() == {"a": 1, "b": 2, "c": 3}
    finally:
        # Guarded so a missing file cannot mask the original failure.
        if os.path.exists(storage_file):
            os.remove(storage_file)


def remove_non_empty_dir(dir_path):
    """Delete ``dir_path`` together with everything inside it.

    Nested subdirectories are removed as well.

    Args:
        dir_path: Path of the directory to delete.

    Raises:
        FileNotFoundError: If ``dir_path`` does not exist.
        NotADirectoryError: If ``dir_path`` is not a directory.
    """
    # Imported locally so the helper does not rely on the module's import block.
    import shutil

    shutil.rmtree(dir_path)


def test_multi_tenant():
    """Exercise MultiTenantNanoVDB: validation, persistence, eviction, deletion.

    The storage directory is removed in a ``finally`` block so that a failing
    assertion cannot leave files behind; a stale directory would break the
    "directory does not exist yet" check on the next run.
    """
    storage_dir = "nano_multi_tenant_storage"

    # A cache that cannot hold a single tenant is rejected at construction.
    with pytest.raises(ValueError):
        MultiTenantNanoVDB(1024, max_capacity=0)

    try:
        # Data saved through one manager must be readable from a fresh one.
        multi_tenant = MultiTenantNanoVDB(1024)
        tenant_id = multi_tenant.create_tenant()
        tenant = multi_tenant.get_tenant(tenant_id)

        tenant.store_additional_data(a=1, b=2, c=3)
        multi_tenant.save()

        multi_tenant = MultiTenantNanoVDB(1024)
        assert multi_tenant.contain_tenant(tenant_id)
        tenant = multi_tenant.get_tenant(tenant_id)
        assert tenant.get_additional_data() == {"a": 1, "b": 2, "c": 3}

        # Unknown tenants are reported with ValueError.
        with pytest.raises(ValueError):
            multi_tenant.get_tenant("1")  # not a uuid

        # With room for one tenant, loading a second one evicts the first.
        multi_tenant = MultiTenantNanoVDB(1024, max_capacity=1)
        multi_tenant.create_tenant()
        multi_tenant.get_tenant(tenant_id)

        # After deletion a fresh manager must no longer see the tenant.
        multi_tenant.delete_tenant(tenant_id)

        multi_tenant = MultiTenantNanoVDB(1024)
        assert not multi_tenant.contain_tenant(tenant_id)
        remove_non_empty_dir(storage_dir)

        # The storage directory only appears once an eviction happens.
        multi_tenant = MultiTenantNanoVDB(1024, max_capacity=1)
        multi_tenant.create_tenant()
        assert not os.path.exists(storage_dir)
        multi_tenant.create_tenant()
        assert os.path.exists(storage_dir)
    finally:
        if os.path.exists(storage_dir):
            remove_non_empty_dir(storage_dir)

0 comments on commit 0b26df1

Please sign in to comment.