From b929519eab220933e86712cf68b183a5f58c5569 Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Wed, 4 Dec 2024 11:07:06 +0800 Subject: [PATCH 01/14] add sparse embedding support for milvus --- examples/rag_milvus_store.py | 6 +-- lazyllm/tools/rag/__init__.py | 2 + lazyllm/tools/rag/chroma_store.py | 7 +-- lazyllm/tools/rag/data_type.py | 8 ++++ lazyllm/tools/rag/doc_impl.py | 29 ++++++++---- lazyllm/tools/rag/global_metadata.py | 6 +-- lazyllm/tools/rag/milvus_store.py | 37 ++++++++++----- lazyllm/tools/rag/utils.py | 34 ++++++++++++++ tests/basic_tests/test_store.py | 68 ++++++++++++++++++++++++++-- 9 files changed, 160 insertions(+), 37 deletions(-) create mode 100644 lazyllm/tools/rag/data_type.py diff --git a/examples/rag_milvus_store.py b/examples/rag_milvus_store.py index 8bce49ea..231518d4 100644 --- a/examples/rag_milvus_store.py +++ b/examples/rag_milvus_store.py @@ -3,7 +3,7 @@ import os import lazyllm from lazyllm import bind, config -from lazyllm.tools.rag import DocField +from lazyllm.tools.rag import DocField, DataType import shutil class TmpDir: @@ -28,8 +28,8 @@ def __del__(self): } doc_fields = { - 'comment': DocField(data_type=DocField.DTYPE_VARCHAR, max_size=65535, default_value=' '), - 'signature': DocField(data_type=DocField.DTYPE_VARCHAR, max_size=32, default_value=' '), + 'comment': DocField(data_type=DataType.VARCHAR, max_size=65535, default_value=' '), + 'signature': DocField(data_type=DataType.VARCHAR, max_size=32, default_value=' '), } prompt = 'You will play the role of an AI Q&A assistant and complete a dialogue task.'\ diff --git a/lazyllm/tools/rag/__init__.py b/lazyllm/tools/rag/__init__.py index 4ff1a9e4..48810066 100644 --- a/lazyllm/tools/rag/__init__.py +++ b/lazyllm/tools/rag/__init__.py @@ -9,6 +9,7 @@ from .dataReader import SimpleDirectoryReader from .doc_manager import DocManager, DocListManager from .global_metadata import GlobalMetadataDesc as DocField +from .data_type import DataType __all__ = [ @@ -39,4 +40,5 @@ 'DocManager', 'DocListManager', 'DocField', + 'DataType', ] diff --git a/lazyllm/tools/rag/chroma_store.py b/lazyllm/tools/rag/chroma_store.py index 4f4a1a86..16bb7fed 100644 --- a/lazyllm/tools/rag/chroma_store.py +++ b/lazyllm/tools/rag/chroma_store.py @@ -5,7 +5,7 @@ from .store_base import StoreBase, LAZY_ROOT_NAME from .doc_node import DocNode from .index_base import IndexBase -from .utils import _FileNodeIndex +from .utils import _FileNodeIndex, sparse2normal from .default_index import DefaultIndex from .map_store import MapStore @@ -151,10 +151,7 @@ def _build_nodes_from_chroma(self, results: Dict[str, List], embed_dims: Dict[st dim = embed_dims.get(key) if not dim: raise ValueError(f'dim of embed [{key}] is not determined.') - new_embedding = [0] * dim - for idx, val in embedding.items(): - new_embedding[int(idx)] = val - new_embedding_dict[key] = new_embedding + new_embedding_dict[key] = sparse2normal(embedding, dim) else: new_embedding_dict[key] = embedding node.embedding = new_embedding_dict diff --git a/lazyllm/tools/rag/data_type.py b/lazyllm/tools/rag/data_type.py new file mode 100644 index 00000000..5ed32415 --- /dev/null +++ b/lazyllm/tools/rag/data_type.py @@ -0,0 +1,8 @@ +from enum import IntEnum + +class DataType(IntEnum): + VARCHAR = 0 + ARRAY = 1 + INT32 = 2 + FLOAT_VECTOR = 3 + SPARSE_FLOAT_VECTOR = 4 diff --git a/lazyllm/tools/rag/doc_impl.py b/lazyllm/tools/rag/doc_impl.py index 5065fe36..2d1e0ddd 100644 --- a/lazyllm/tools/rag/doc_impl.py +++ b/lazyllm/tools/rag/doc_impl.py @@ -13,8 +13,9 @@ from .smart_embedding_index import SmartEmbeddingIndex from .doc_node import DocNode from .data_loaders import DirectoryReader -from .utils import DocListManager, gen_docid +from .utils import DocListManager, gen_docid, is_sparse from .global_metadata import GlobalMetadataDesc, RAG_DOC_ID, RAG_DOC_PATH +from .data_type import DataType import threading import time @@ -49,7 +50,6 @@ def __init__(self, embed: Dict[str, Callable], dlm: Optional[DocListManager] = N self._reader = DirectoryReader(None, self._local_file_reader, DocImpl._registered_file_reader) self.node_groups: Dict[str, Dict] = {LAZY_ROOT_NAME: {}} self.embed = {k: embed_wrapper(e) for k, e in embed.items()} - self._embed_dims = None self._global_metadata_desc = global_metadata_desc self.store = store_conf # NOTE: will be initialized in _lazy_init() self._activated_embeddings = {} @@ -65,7 +65,15 @@ def _lazy_init(self) -> None: for group in node_groups.keys(): self._activated_embeddings.setdefault(group, set()) - self._embed_dims = {k: len(e('a')) for k, e in self.embed.items()} + embed_dims = {} + embed_datatypes = {} + for k, e in self.embed.items(): + embedding = e('a') + if is_sparse(embedding): + embed_datatypes[k] = DataType.SPARSE_FLOAT_VECTOR + else: + embed_dims[k] = len(embedding) + embed_datatypes[k] = DataType.FLOAT_VECTOR if self.store is None: self.store = { @@ -73,7 +81,8 @@ def _lazy_init(self) -> None: } if isinstance(self.store, Dict): - self.store = self._create_store(self.store) + self.store = self._create_store(store_conf=self.store, embed_dims=embed_dims, + embed_datatypes=embed_datatypes) else: raise ValueError(f'store type [{type(self.store)}] is not a dict.') @@ -95,7 +104,8 @@ def _lazy_init(self) -> None: self._daemon.daemon = True self._daemon.start() - def _create_store(self, store_conf: Optional[Dict]) -> StoreBase: + def _create_store(self, store_conf: Optional[Dict], embed_dims: Optional[Dict[str, int]] = None, + embed_datatypes: Optional[Dict[str, DataType]] = None) -> StoreBase: store_type = store_conf.get('type') if not store_type: raise ValueError('store type is not specified.') @@ -108,11 +118,11 @@ def _create_store(self, store_conf: Optional[Dict]) -> StoreBase: store = MapStore(node_groups=list(self._activated_embeddings.keys()), embed=self.embed, **kwargs) elif store_type == "chroma": store = ChromadbStore(group_embed_keys=self._activated_embeddings, embed=self.embed, - embed_dims=self._embed_dims, **kwargs) + embed_dims=embed_dims, **kwargs) elif store_type == "milvus": store = MilvusStore(group_embed_keys=self._activated_embeddings, embed=self.embed, - embed_dims=self._embed_dims, global_metadata_desc=self._global_metadata_desc, - **kwargs) + embed_dims=embed_dims, embed_datatypes=embed_datatypes, + global_metadata_desc=self._global_metadata_desc, **kwargs) else: raise NotImplementedError( f"Not implemented store type for {store_type}" @@ -131,7 +141,8 @@ def _create_store(self, store_conf: Optional[Dict]) -> StoreBase: index = SmartEmbeddingIndex(backend_type=backend_type, group_embed_keys=self._activated_embeddings, embed=self.embed, - embed_dims=self._embed_dims, + embed_dims=embed_dims, + embed_datatypes=embed_datatypes, global_metadata_desc=self._global_metadata_desc, **kwargs) else: diff --git a/lazyllm/tools/rag/global_metadata.py b/lazyllm/tools/rag/global_metadata.py index a5fe2652..1a392532 100644 --- a/lazyllm/tools/rag/global_metadata.py +++ b/lazyllm/tools/rag/global_metadata.py @@ -1,11 +1,7 @@ from typing import Optional, Any class GlobalMetadataDesc: - DTYPE_VARCHAR = 0 - DTYPE_ARRAY = 1 - DTYPE_INT32 = 2 - - # max_size MUST be set when data_type is DTYPE_VARCHAR or DTYPE_ARRAY + # max_size MUST be set when data_type is DataType.VARCHAR or DataType.ARRAY def __init__(self, data_type: int, element_type: Optional[int] = None, default_value: Optional[Any] = None, max_size: Optional[int] = None): self.data_type = data_type diff --git a/lazyllm/tools/rag/milvus_store.py b/lazyllm/tools/rag/milvus_store.py index bc2666c3..9622527a 100644 --- a/lazyllm/tools/rag/milvus_store.py +++ b/lazyllm/tools/rag/milvus_store.py @@ -7,6 +7,7 @@ from .index_base import IndexBase from .store_base import StoreBase from .global_metadata import GlobalMetadataDesc, RAG_DOC_PATH, RAG_DOC_ID +from .data_type import DataType from lazyllm.common import override, obj2str, str2obj class MilvusStore(StoreBase): @@ -38,9 +39,9 @@ def _def_constants(self) -> None: } self._builtin_global_metadata_desc = { - RAG_DOC_ID: GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_VARCHAR, + RAG_DOC_ID: GlobalMetadataDesc(data_type=DataType.VARCHAR, default_value=' ', max_size=512), - RAG_DOC_PATH: GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_VARCHAR, + RAG_DOC_PATH: GlobalMetadataDesc(data_type=DataType.VARCHAR, default_value=' ', max_size=65535), } @@ -48,10 +49,13 @@ def _def_constants(self) -> None: pymilvus.DataType.VARCHAR, pymilvus.DataType.ARRAY, pymilvus.DataType.INT32, + pymilvus.DataType.FLOAT_VECTOR, + pymilvus.DataType.SPARSE_FLOAT_VECTOR, ] def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Callable], # noqa C901 - embed_dims: Dict[str, int], global_metadata_desc: Dict[str, GlobalMetadataDesc], + embed_dims: Dict[str, int], embed_datatypes: Dict[str, DataType], + global_metadata_desc: Dict[str, GlobalMetadataDesc], uri: str, embedding_index_type: Optional[str] = None, embedding_metric_type: Optional[str] = None, **kwargs): self._def_constants() @@ -60,6 +64,11 @@ def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Calla self._embed = embed self._client = pymilvus.MilvusClient(uri=uri) + if embed_dims is None: + embed_dims = {} + if embed_datatypes is None: + embed_datatypes = {} + # XXX milvus 2.4.x doesn't support `default_value` # https://milvus.io/docs/product_faq.md#Does-Milvus-support-specifying-default-values-for-scalar-or-vector-fields if global_metadata_desc: @@ -85,26 +94,32 @@ def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Calla field_list.append(pymilvus.FieldSchema(name=key, **info)) for key in embed_keys: - dim = embed_dims.get(key) - if not dim: - raise ValueError(f'cannot find embedding dim of embed [{key}] in [{embed_dims}]') + datatype = embed_datatypes.get(key) + if not datatype: + raise ValueError(f'cannot find embedding datatype if embed [{key}] in [{embed_datatypes}]') + + field_kwargs = {} + dim = embed_dims.get(key) # can be empty if embedding is sparse + if dim: + field_kwargs['dim'] = dim field_name = self._gen_embedding_key(key) - field_list.append(pymilvus.FieldSchema(name=field_name, dtype=pymilvus.DataType.FLOAT_VECTOR, dim=dim)) + field_list.append(pymilvus.FieldSchema(name=field_name, dtype=self._type2milvus[datatype], + **field_kwargs)) index_params.add_index(field_name=field_name, index_type=embedding_index_type, metric_type=embedding_metric_type) if self._global_metadata_desc: for key, desc in self._global_metadata_desc.items(): - if desc.data_type == GlobalMetadataDesc.DTYPE_ARRAY: + if desc.data_type == DataType.ARRAY: if not desc.element_type: raise ValueError(f'Milvus field [{key}]: `element_type` is required when ' - '`data_type` is DTYPE_ARRAY.') + '`data_type` is ARRAY.') field_args = { 'element_type': self._type2milvus[desc.element_type], 'max_capacity': desc.max_size, } - elif desc.data_type == GlobalMetadataDesc.DTYPE_VARCHAR: + elif desc.data_type == DataType.VARCHAR: field_args = { 'max_length': desc.max_size, } @@ -236,7 +251,7 @@ def _construct_filter_expr(self, filters: Dict[str, Union[str, int, List, Set]]) key = self._gen_field_key(name) if (not isinstance(candidates, List)) and (not isinstance(candidates, Set)): candidates = list(candidates) - if desc.data_type == GlobalMetadataDesc.DTYPE_ARRAY: + if desc.data_type == DataType.ARRAY: # https://github.com/milvus-io/milvus/discussions/35279 # `array_contains_any` requires milvus >= 2.4.3 and is not supported in local(aka lite) mode. ret_str += f'array_contains_any({key}, {candidates}) and ' diff --git a/lazyllm/tools/rag/utils.py b/lazyllm/tools/rag/utils.py index ce450cad..f7e4aa2f 100644 --- a/lazyllm/tools/rag/utils.py +++ b/lazyllm/tools/rag/utils.py @@ -570,3 +570,37 @@ def generic_process_filters(nodes: List[DocNode], filters: Dict[str, Union[str, else: res.append(node) return res + +def sparse2normal(embedding: List[Union[Dict, Tuple]], dim: int) -> List[float]: + if not embedding: + return [] + + new_embedding = [0] * dim + if isinstance(embedding[0], dict): + for idx, val in embedding.items(): + new_embedding[int(idx)] = val + elif isinstance(embedding[0], tuple): + for pair in embedding: + new_embedding[int(pair[0])] = pair[1] + else: + raise TypeError(f'unsupported embedding datatype `{type(embedding[0])}`') + + return new_embedding + +def is_sparse(embedding: Union[Dict[int, float], List[Tuple[int, float]], List[float]]) -> bool: + if isinstance(embedding, dict): + return True + + if not isinstance(embedding, list): + raise TypeError(f'unsupported embedding type `{type(embedding)}`') + + if len(embedding) == 0: + raise ValueError('empty embedding type is not determined.') + + if isinstance(embedding[0], tuple): + return True + + if isinstance(embedding[0], float): + return False + + raise TypeError(f'unsupported embedding type `{type(embedding[0])}`') diff --git a/tests/basic_tests/test_store.py b/tests/basic_tests/test_store.py index 558ebaba..4530b45b 100644 --- a/tests/basic_tests/test_store.py +++ b/tests/basic_tests/test_store.py @@ -9,6 +9,7 @@ from lazyllm.tools.rag.chroma_store import ChromadbStore from lazyllm.tools.rag.milvus_store import MilvusStore from lazyllm.tools.rag.doc_node import DocNode +from lazyllm.tools.rag.data_type import DataType from lazyllm.tools.rag.global_metadata import GlobalMetadataDesc @@ -184,14 +185,14 @@ def test_group_others(self): self.assertEqual(self.store.is_group_active("group2"), False) @pytest.mark.skip_on_win -class TestMilvusStore(unittest.TestCase): +class TestMilvusStoreWithNormalEmbedding(unittest.TestCase): def setUp(self): self.mock_embed = { 'vec1': MagicMock(return_value=[1.0, 2.0, 3.0]), 'vec2': MagicMock(return_value=[400.0, 500.0, 600.0, 700.0, 800.0]), } self.global_metadata_desc = { - 'comment': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_VARCHAR, max_size=65535, default_value=' '), + 'comment': GlobalMetadataDesc(data_type=DataType.VARCHAR, max_size=65535, default_value=' '), } self.node_groups = [LAZY_ROOT_NAME, "group1", "group2"] @@ -207,8 +208,14 @@ def setUp(self): "vec1": 3, "vec2": 5, } + self.embed_datatypes = { + 'vec1': DataType.FLOAT_VECTOR, + 'vec2': DataType.FLOAT_VECTOR, + } + self.store = MilvusStore(group_embed_keys=self.group_embed_keys, embed=self.mock_embed, - embed_dims=self.embed_dims, global_metadata_desc=self.global_metadata_desc, + embed_dims=self.embed_dims, embed_datatypes=self.embed_datatypes, + global_metadata_desc=self.global_metadata_desc, uri=self.store_file) self.node1 = DocNode(uid="1", text="text1", group="group1", parent=None, @@ -281,7 +288,7 @@ def test_reload(self): del self.store self.store = MilvusStore(group_embed_keys=self.group_embed_keys, embed=self.mock_embed, embed_dims=self.embed_dims, global_metadata_desc=self.global_metadata_desc, - uri=self.store_file) + embed_datatypes=self.embed_datatypes, uri=self.store_file) nodes = self.store.get_nodes('group1') orig_nodes = [self.node1, self.node2, self.node3] @@ -300,3 +307,56 @@ def _test_query_with_array_filter(self): filters={'tags': [2]}) self.assertEqual(len(ret), 2) self.assertEqual(set([ret[0].uid, ret[1].uid]), set([self.node1.uid, self.node2.uid])) + + +@pytest.mark.skip_on_win +class TestMilvusStoreWithSparseEmbedding(unittest.TestCase): + def setUp(self): + self.mock_embed = { + 'vec1': MagicMock(return_value={0: 1.0, 1: 2.0, 2: 3.0}), + 'vec2': MagicMock(return_value={0: 400.0, 1: 500.0, 2: 600.0, 3: 700.0, 4: 800.0}), + } + self.global_metadata_desc = { + 'comment': GlobalMetadataDesc(data_type=DataType.VARCHAR, max_size=65535, default_value=' '), + } + + self.node_groups = [LAZY_ROOT_NAME, "group1", "group2"] + _, self.store_file = tempfile.mkstemp(suffix=".db") + + embed_keys = set(['vec1', 'vec2']) + self.group_embed_keys = { + LAZY_ROOT_NAME: embed_keys, + 'group1': embed_keys, + 'group2': embed_keys, + } + self.embed_datatypes = { + 'vec1': DataType.SPARSE_FLOAT_VECTOR, + 'vec2': DataType.SPARSE_FLOAT_VECTOR, + } + + self.store = MilvusStore(group_embed_keys=self.group_embed_keys, embed=self.mock_embed, + embed_dims=None, embed_datatypes=self.embed_datatypes, + global_metadata_desc=self.global_metadata_desc, + uri=self.store_file, embedding_index_type='SPARSE_INVERTED_INDEX', + embedding_metric_type='IP') + + self.node1 = DocNode(uid="1", text="text1", group="group1", parent=None, + embedding={"vec1": {0: 1.0, 1: 2.0, 2: 3.0}, + "vec2": {0: 400.0, 1: 500.0, 2: 600.0, 3: 700.0, 4: 800.0}}) + self.node2 = DocNode(uid="2", text="text2", group="group1", parent=None, + embedding={"vec1": {0: 8.0, 1: 9.0, 2: 10.0}, + "vec2": {0: 11.0, 1: 12.0, 2: 13.0, 3: 14.0, 4: 15.0}}) + + def tearDown(self): + os.remove(self.store_file) + + def test_sparse_embedding(self): + self.store.update_nodes([self.node1, self.node2]) + + ret = self.store.query(query='test', group_name='group1', embed_keys=['vec1'], topk=1) + self.assertEqual(len(ret), 1) + self.assertEqual(ret[0].uid, self.node2.uid) + + ret = self.store.query(query='test', group_name='group1', embed_keys=['vec2'], topk=1) + self.assertEqual(len(ret), 1) + self.assertEqual(ret[0].uid, self.node1.uid) From 9a2ded92a65cb80336cb1ff8274cd0d65d480a73 Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Wed, 4 Dec 2024 15:42:15 +0800 Subject: [PATCH 02/14] review1 --- lazyllm/tools/rag/default_index.py | 9 ++++++++- lazyllm/tools/rag/utils.py | 8 ++++---- tests/basic_tests/test_rag_utils.py | 25 ++++++++++++++++++++++++- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/lazyllm/tools/rag/default_index.py b/lazyllm/tools/rag/default_index.py index 6a4c3133..8d32485b 100644 --- a/lazyllm/tools/rag/default_index.py +++ b/lazyllm/tools/rag/default_index.py @@ -4,7 +4,7 @@ from .index_base import IndexBase from lazyllm import LOG from lazyllm.common import override -from .utils import parallel_do_embedding, generic_process_filters +from .utils import parallel_do_embedding, generic_process_filters, is_sparse from .similarity import registered_similarities # ---------------------------------------------------------------------------- # @@ -51,6 +51,7 @@ def query( if not embed_keys: embed_keys = list(self.embed.keys()) query_embedding = {k: self.embed[k](query) for k in embed_keys} + self._check_supported(similarity_name, query_embedding) modified_nodes = parallel_do_embedding(self.embed, embed_keys, nodes) self.store.update_nodes(modified_nodes) similarities = similarity_func(query_embedding, nodes, topk=topk, **kwargs) @@ -78,3 +79,9 @@ def _filter_nodes_by_score(self, similarities: List[Tuple[DocNode, float]], topk similarities = similarities[:topk] return [node for node, score in similarities if score > similarity_cut_off] + + def _check_supported(self, similarity_name: str, query_embedding: Dict[str, Any]) -> None: + if similarity_name.lower() == 'cosine': + for k, e in query_embedding.items(): + if is_sparse(e): + raise NotImplementedError(f'embed `{k}` which is sparse is not supported.') diff --git a/lazyllm/tools/rag/utils.py b/lazyllm/tools/rag/utils.py index f7e4aa2f..47e36786 100644 --- a/lazyllm/tools/rag/utils.py +++ b/lazyllm/tools/rag/utils.py @@ -571,15 +571,15 @@ def generic_process_filters(nodes: List[DocNode], filters: Dict[str, Union[str, res.append(node) return res -def sparse2normal(embedding: List[Union[Dict, Tuple]], dim: int) -> List[float]: +def sparse2normal(embedding: Union[Dict[int, float], List[Tuple[int, float]]], dim: int) -> List[float]: if not embedding: return [] new_embedding = [0] * dim - if isinstance(embedding[0], dict): + if isinstance(embedding, dict): for idx, val in embedding.items(): new_embedding[int(idx)] = val - elif isinstance(embedding[0], tuple): + elif isinstance(embedding, list) and isinstance(embedding[0], tuple): for pair in embedding: new_embedding[int(pair[0])] = pair[1] else: @@ -600,7 +600,7 @@ def is_sparse(embedding: Union[Dict[int, float], List[Tuple[int, float]], List[f if isinstance(embedding[0], tuple): return True - if isinstance(embedding[0], float): + if isinstance(embedding[0], float) or isinstance(embedding[0], int): return False raise TypeError(f'unsupported embedding type `{type(embedding[0])}`') diff --git a/tests/basic_tests/test_rag_utils.py b/tests/basic_tests/test_rag_utils.py index ce8c4283..e5eb6b67 100644 --- a/tests/basic_tests/test_rag_utils.py +++ b/tests/basic_tests/test_rag_utils.py @@ -1,6 +1,6 @@ from lazyllm.tools.rag.utils import generic_process_filters from lazyllm.tools.rag.doc_node import DocNode -from lazyllm.tools.rag.utils import _FileNodeIndex +from lazyllm.tools.rag.utils import _FileNodeIndex, sparse2normal, is_sparse from lazyllm.tools.rag.store_base import LAZY_ROOT_NAME from lazyllm.tools.rag.global_metadata import RAG_DOC_PATH import unittest @@ -24,6 +24,29 @@ def test_generic_process_filters(self): res = generic_process_filters(nodes, {'k2': 'v6'}) assert len(res) == 0 + def test_sparse2normal(self): + embedding = {1: 3, 5: 12} + dim = 6 + res = sparse2normal(embedding, dim) + assert len(res) == dim + assert res == [0, 3, 0, 0, 0, 12] + + embedding = [(0, 9), (2, 14), (4, 28)] + dim = 8 + res = sparse2normal(embedding, dim) + assert len(res) == dim + assert res == [9, 0, 14, 0, 28, 0, 0, 0] + + def test_is_sparse(self): + embedding = {1: 3, 5: 12} + assert is_sparse(embedding) + + embedding = [(0, 9), (2, 14), (4, 28)] + assert is_sparse(embedding) + + embedding = [9, 0, 14, 0, 28, 0, 0, 0] + assert not is_sparse(embedding) + class TestFileNodeIndex(unittest.TestCase): def setUp(self): self.index = _FileNodeIndex() From 7267752c68a478b981547d67f8e63c29f8dc72bf Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Wed, 4 Dec 2024 16:00:01 +0800 Subject: [PATCH 03/14] minor changes --- lazyllm/tools/rag/default_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lazyllm/tools/rag/default_index.py b/lazyllm/tools/rag/default_index.py index 8d32485b..06bd731a 100644 --- a/lazyllm/tools/rag/default_index.py +++ b/lazyllm/tools/rag/default_index.py @@ -1,4 +1,4 @@ -from typing import List, Callable, Optional, Dict, Union, Tuple +from typing import List, Callable, Optional, Dict, Union, Tuple, Any from .doc_node import DocNode from .store_base import StoreBase from .index_base import IndexBase From 962bc35c8aaa716d0a9b4c818f3e246d0e7b220d Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Thu, 5 Dec 2024 12:05:27 +0800 Subject: [PATCH 04/14] review3 --- lazyllm/tools/rag/default_index.py | 2 +- lazyllm/tools/rag/doc_impl.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lazyllm/tools/rag/default_index.py b/lazyllm/tools/rag/default_index.py index 06bd731a..77ee8381 100644 --- a/lazyllm/tools/rag/default_index.py +++ b/lazyllm/tools/rag/default_index.py @@ -84,4 +84,4 @@ def _check_supported(self, similarity_name: str, query_embedding: Dict[str, Any] if similarity_name.lower() == 'cosine': for k, e in query_embedding.items(): if is_sparse(e): - raise NotImplementedError(f'embed `{k}` which is sparse is not supported.') + raise NotImplementedError(f'embed `{k}`, which is sparse, is not supported.') diff --git a/lazyllm/tools/rag/doc_impl.py b/lazyllm/tools/rag/doc_impl.py index 2d1e0ddd..779cd786 100644 --- a/lazyllm/tools/rag/doc_impl.py +++ b/lazyllm/tools/rag/doc_impl.py @@ -334,9 +334,12 @@ def retrieve(self, query: str, group_name: str, similarity: str, similarity_cut_ if not index_instance: raise NotImplementedError(f"index type '{index}' is not supported currently.") - return index_instance.query(query=query, group_name=group_name, similarity_name=similarity, - similarity_cut_off=similarity_cut_off, topk=topk, - embed_keys=embed_keys, filters=filters, **similarity_kws) + try: + return index_instance.query(query=query, group_name=group_name, similarity_name=similarity, + similarity_cut_off=similarity_cut_off, topk=topk, + embed_keys=embed_keys, filters=filters, **similarity_kws) + except Exception as e: + raise RuntimeError(f'index type `{index}` of store `{type(self.store)}` query failed: {e}') @staticmethod def find_parent(nodes: List[DocNode], group: str) -> List[DocNode]: From c1efb6ae0a11550f43f4120f2a46d72b7b4dbada Mon Sep 17 00:00:00 2001 From: dorren Date: Wed, 11 Dec 2024 18:56:36 +0800 Subject: [PATCH 05/14] enable sparse & dense index --- examples/rag_map_store_with_milvus_index.py | 6 +++-- examples/rag_milvus_store.py | 6 +++-- lazyllm/tools/rag/milvus_store.py | 21 ++++++++------- tests/basic_tests/test_store.py | 30 ++++++++++++++++----- 4 files changed, 43 insertions(+), 20 deletions(-) diff --git a/examples/rag_map_store_with_milvus_index.py b/examples/rag_map_store_with_milvus_index.py index 50834dd3..c5b15794 100644 --- a/examples/rag_map_store_with_milvus_index.py +++ b/examples/rag_map_store_with_milvus_index.py @@ -15,8 +15,10 @@ def run(query): 'backend': 'milvus', 'kwargs': { 'uri': store_file, - 'embedding_index_type': 'HNSW', - 'embedding_metric_type': 'COSINE', + 'index_kwargs': { + 'index_type': 'HNSW', + 'metric_type': 'COSINE', + } }, }, }, diff --git a/examples/rag_milvus_store.py b/examples/rag_milvus_store.py index 231518d4..7252943c 100644 --- a/examples/rag_milvus_store.py +++ b/examples/rag_milvus_store.py @@ -22,8 +22,10 @@ def __del__(self): 'type': 'milvus', 'kwargs': { 'uri': tmp_dir.store_file, - 'embedding_index_type': 'HNSW', - 'embedding_metric_type': 'COSINE', + 'index_kwargs': { + 'index_type': 'HNSW', + 'metric_type': 'COSINE', + } }, } diff --git a/lazyllm/tools/rag/milvus_store.py b/lazyllm/tools/rag/milvus_store.py index 9622527a..947f9f9c 100644 --- a/lazyllm/tools/rag/milvus_store.py +++ b/lazyllm/tools/rag/milvus_store.py @@ -56,8 +56,7 @@ def _def_constants(self) -> None: def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Callable], # noqa C901 embed_dims: Dict[str, int], embed_datatypes: Dict[str, DataType], global_metadata_desc: Dict[str, GlobalMetadataDesc], - uri: str, embedding_index_type: Optional[str] = None, - embedding_metric_type: Optional[str] = None, **kwargs): + uri: str, **kwargs): self._def_constants() self._group_embed_keys = group_embed_keys @@ -76,11 +75,7 @@ def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Calla else: self._global_metadata_desc = self._builtin_global_metadata_desc - if not embedding_index_type: - embedding_index_type = 'HNSW' - - if not embedding_metric_type: - embedding_metric_type = 'COSINE' + index_kwargs_list = kwargs.get('index_kwargs', None) collections = self._client.list_collections() for group, embed_keys in group_embed_keys.items(): @@ -106,9 +101,15 @@ def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Calla field_name = self._gen_embedding_key(key) field_list.append(pymilvus.FieldSchema(name=field_name, dtype=self._type2milvus[datatype], **field_kwargs)) - index_params.add_index(field_name=field_name, index_type=embedding_index_type, - metric_type=embedding_metric_type) - + if isinstance(index_kwargs_list, list): + for index_kwargs in index_kwargs_list: + if index_kwargs.get("__embed_key__", None) == key: + index_kwarg = index_kwargs + index_kwarg.pop(key, None) + index_params.add_index(field_name=field_name, **index_kwarg) + elif isinstance(index_kwargs_list, dict): + index_params.add_index(field_name=field_name, **index_kwargs_list) + if self._global_metadata_desc: for key, desc in self._global_metadata_desc.items(): if desc.data_type == DataType.ARRAY: diff --git a/tests/basic_tests/test_store.py b/tests/basic_tests/test_store.py index 4530b45b..f1ff4c58 100644 --- a/tests/basic_tests/test_store.py +++ b/tests/basic_tests/test_store.py @@ -213,10 +213,17 @@ def setUp(self): 'vec2': DataType.FLOAT_VECTOR, } + self.kwargs = { + 'uri': self.store_file, + 'index_kwargs': { + 'index_type': 'HNSW', + 'metric_type': 'COSINE', + } + } + self.store = MilvusStore(group_embed_keys=self.group_embed_keys, embed=self.mock_embed, embed_dims=self.embed_dims, embed_datatypes=self.embed_datatypes, - global_metadata_desc=self.global_metadata_desc, - uri=self.store_file) + global_metadata_desc=self.global_metadata_desc, **self.kwargs) self.node1 = DocNode(uid="1", text="text1", group="group1", parent=None, embedding={"vec1": [8.0, 9.0, 10.0], "vec2": [11.0, 12.0, 13.0, 14.0, 15.0]}, @@ -288,7 +295,7 @@ def test_reload(self): del self.store self.store = MilvusStore(group_embed_keys=self.group_embed_keys, embed=self.mock_embed, embed_dims=self.embed_dims, global_metadata_desc=self.global_metadata_desc, - embed_datatypes=self.embed_datatypes, uri=self.store_file) + embed_datatypes=self.embed_datatypes, **self.kwargs) nodes = self.store.get_nodes('group1') orig_nodes = [self.node1, self.node2, self.node3] @@ -334,11 +341,22 @@ def setUp(self): 'vec2': DataType.SPARSE_FLOAT_VECTOR, } + self.kwargs = { + 'uri': self.store_file, + 'index_kwargs': [{ + '__embed_key__': 'vec1', + 'index_type': 'SPARSE_INVERTED_INDEX', + 'metric_type': 'IP', + },{ + '__embed_key__': 'vec2', + 'index_type': 'SPARSE_INVERTED_INDEX', + 'metric_type': 'IP', + }] + } + self.store = MilvusStore(group_embed_keys=self.group_embed_keys, embed=self.mock_embed, embed_dims=None, embed_datatypes=self.embed_datatypes, - global_metadata_desc=self.global_metadata_desc, - uri=self.store_file, embedding_index_type='SPARSE_INVERTED_INDEX', - embedding_metric_type='IP') + global_metadata_desc=self.global_metadata_desc, **self.kwargs) self.node1 = DocNode(uid="1", text="text1", group="group1", parent=None, embedding={"vec1": {0: 1.0, 1: 2.0, 2: 3.0}, From 5e5555c4b79d6373ec0eb24d3616a1e7a5e00782 Mon Sep 17 00:00:00 2001 From: dorren Date: Wed, 11 Dec 2024 19:06:05 +0800 Subject: [PATCH 06/14] alter default value --- lazyllm/tools/rag/milvus_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lazyllm/tools/rag/milvus_store.py b/lazyllm/tools/rag/milvus_store.py index 947f9f9c..c38c3edb 100644 --- a/lazyllm/tools/rag/milvus_store.py +++ b/lazyllm/tools/rag/milvus_store.py @@ -75,7 +75,7 @@ def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Calla else: self._global_metadata_desc = self._builtin_global_metadata_desc - index_kwargs_list = kwargs.get('index_kwargs', None) + index_kwargs_list = kwargs.get('index_kwargs', {}) collections = self._client.list_collections() for group, embed_keys in group_embed_keys.items(): From 3a532d6ae4850057cc95504cf3091f29a7eb8c13 Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Thu, 12 Dec 2024 10:12:56 +0800 Subject: [PATCH 07/14] fixes --- lazyllm/tools/rag/milvus_store.py | 22 ++++++++++++---------- tests/basic_tests/test_store.py | 9 ++++++--- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/lazyllm/tools/rag/milvus_store.py b/lazyllm/tools/rag/milvus_store.py index c38c3edb..5ae95fe0 100644 --- a/lazyllm/tools/rag/milvus_store.py +++ b/lazyllm/tools/rag/milvus_store.py @@ -56,7 +56,7 @@ def _def_constants(self) -> None: def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Callable], # noqa C901 embed_dims: Dict[str, int], embed_datatypes: Dict[str, DataType], global_metadata_desc: Dict[str, GlobalMetadataDesc], - uri: str, **kwargs): + uri: str, index_kwargs: Optional[Union[Dict, List]] = None): self._def_constants() self._group_embed_keys = group_embed_keys @@ -75,8 +75,6 @@ def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Calla else: self._global_metadata_desc = self._builtin_global_metadata_desc - index_kwargs_list = kwargs.get('index_kwargs', {}) - collections = self._client.list_collections() for group, embed_keys in group_embed_keys.items(): if group in collections: @@ -101,15 +99,19 @@ def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Calla field_name = self._gen_embedding_key(key) field_list.append(pymilvus.FieldSchema(name=field_name, dtype=self._type2milvus[datatype], **field_kwargs)) - if isinstance(index_kwargs_list, list): - for index_kwargs in index_kwargs_list: - if index_kwargs.get("__embed_key__", None) == key: - index_kwarg = index_kwargs + if isinstance(index_kwargs, list): + for item in index_kwargs: + item_key = item.get("__embed_key__", None) + if not item_key: + raise ValueError(f'cannot find `__embed_key__` in `index_kwargs` of `{field_name}`') + if item_key == key: + index_kwarg = item.copy() index_kwarg.pop(key, None) index_params.add_index(field_name=field_name, **index_kwarg) - elif isinstance(index_kwargs_list, dict): - index_params.add_index(field_name=field_name, **index_kwargs_list) - + break + elif isinstance(index_kwargs, dict): + index_params.add_index(field_name=field_name, **index_kwargs) + if self._global_metadata_desc: for key, desc in self._global_metadata_desc.items(): if desc.data_type == DataType.ARRAY: diff --git a/tests/basic_tests/test_store.py b/tests/basic_tests/test_store.py index f1ff4c58..b80b3158 100644 --- a/tests/basic_tests/test_store.py +++ b/tests/basic_tests/test_store.py @@ -343,15 +343,18 @@ def setUp(self): self.kwargs = { 'uri': self.store_file, - 'index_kwargs': [{ + 'index_kwargs': [ + { '__embed_key__': 'vec1', 'index_type': 'SPARSE_INVERTED_INDEX', 'metric_type': 'IP', - },{ + }, + { '__embed_key__': 'vec2', 'index_type': 'SPARSE_INVERTED_INDEX', 'metric_type': 'IP', - }] + } + ] } self.store = MilvusStore(group_embed_keys=self.group_embed_keys, embed=self.mock_embed, From b61ee3c7bc3a42f5b216d0c91d1c51729fb6fc07 Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Mon, 16 Dec 2024 10:10:30 +0800 Subject: [PATCH 08/14] s --- lazyllm/tools/rag/milvus_store.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lazyllm/tools/rag/milvus_store.py b/lazyllm/tools/rag/milvus_store.py index 38074a9b..54c29d30 100644 --- a/lazyllm/tools/rag/milvus_store.py +++ b/lazyllm/tools/rag/milvus_store.py @@ -100,13 +100,14 @@ def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Calla field_list.append(pymilvus.FieldSchema(name=field_name, dtype=self._type2milvus[datatype], **field_kwargs)) if isinstance(index_kwargs, list): + embed_key_field_name = "__embed_key__" for item in index_kwargs: - item_key = item.get("__embed_key__", None) + item_key = item.get(embed_key_field_name, None) if not item_key: - raise ValueError(f'cannot find `__embed_key__` in `index_kwargs` of `{field_name}`') + raise ValueError(f'cannot find `{embed_key_field_name}` in `index_kwargs` of `{field_name}`') if item_key == key: index_kwarg = item.copy() - index_kwarg.pop(key, None) + index_kwarg.pop(embed_key_field_name, None) index_params.add_index(field_name=field_name, **index_kwarg) break elif isinstance(index_kwargs, dict): From 93866e08d63fe6afac9c375c9a5505255f47ae8c Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Mon, 16 Dec 2024 11:46:33 +0800 Subject: [PATCH 09/14] rename __embed__key__ => embed_key --- lazyllm/tools/rag/milvus_store.py | 2 +- tests/basic_tests/test_store.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lazyllm/tools/rag/milvus_store.py b/lazyllm/tools/rag/milvus_store.py index 54c29d30..5a4ab571 100644 --- a/lazyllm/tools/rag/milvus_store.py +++ b/lazyllm/tools/rag/milvus_store.py @@ -100,7 +100,7 @@ def __init__(self, group_embed_keys: Dict[str, Set[str]], embed: Dict[str, Calla field_list.append(pymilvus.FieldSchema(name=field_name, dtype=self._type2milvus[datatype], **field_kwargs)) if isinstance(index_kwargs, list): - embed_key_field_name = "__embed_key__" + embed_key_field_name = "embed_key" for item in index_kwargs: item_key = item.get(embed_key_field_name, None) if not item_key: diff --git a/tests/basic_tests/test_store.py b/tests/basic_tests/test_store.py index 090d344d..865a10be 100644 --- a/tests/basic_tests/test_store.py +++ b/tests/basic_tests/test_store.py @@ -345,12 +345,12 @@ def setUp(self): 'uri': self.store_file, 'index_kwargs': [ { - '__embed_key__': 'vec1', + 'embed_key': 'vec1', 'index_type': 'SPARSE_INVERTED_INDEX', 'metric_type': 'IP', }, { - '__embed_key__': 'vec2', + 'embed_key': 'vec2', 'index_type': 'SPARSE_INVERTED_INDEX', 'metric_type': 'IP', } From fab1ae3877e245a158d3bf05b5a662f6d27d90eb Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Mon, 16 Dec 2024 17:09:42 +0800 Subject: [PATCH 10/14] bugfix: modify root node's global metadata --- lazyllm/tools/rag/chroma_store.py | 2 +- lazyllm/tools/rag/doc_node.py | 4 ++++ lazyllm/tools/rag/milvus_store.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lazyllm/tools/rag/chroma_store.py b/lazyllm/tools/rag/chroma_store.py index 99772997..5574f6e5 100644 --- a/lazyllm/tools/rag/chroma_store.py +++ b/lazyllm/tools/rag/chroma_store.py @@ -170,7 +170,7 @@ def _make_chroma_metadata(self, node: DocNode) -> Dict[str, Any]: "metadata": obj2str(node._metadata), } - if not node.parent: + if node.is_root_node: metadata["global_metadata"] = obj2str(node.global_metadata) return metadata diff --git a/lazyllm/tools/rag/doc_node.py b/lazyllm/tools/rag/doc_node.py index 97d097af..f56a6450 100644 --- a/lazyllm/tools/rag/doc_node.py +++ b/lazyllm/tools/rag/doc_node.py @@ -83,6 +83,10 @@ def root_node(self) -> Optional["DocNode"]: root = root.parent return root or self + @property + def is_root_node(self) -> bool: + return (not self.parent) + @property def global_metadata(self) -> Dict[str, Any]: return self.root_node._global_metadata diff --git a/lazyllm/tools/rag/milvus_store.py b/lazyllm/tools/rag/milvus_store.py index 62cb5565..12e90d99 100644 --- a/lazyllm/tools/rag/milvus_store.py +++ b/lazyllm/tools/rag/milvus_store.py @@ -280,7 +280,7 @@ def _deserialize_node_partial(self, result: Dict) -> DocNode: if k.startswith(self._embedding_key_prefix): doc.embedding[k[len(self._embedding_key_prefix):]] = v elif k.startswith(self._global_metadata_key_prefix): - if doc.parent: + if doc.is_root_node: doc._global_metadata[k[len(self._global_metadata_key_prefix):]] = v return doc From f037e827add5f795b5a8c6f6d4d35ab387a1312a Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Mon, 16 Dec 2024 18:47:48 +0800 Subject: [PATCH 11/14] add ut --- tests/basic_tests/test_store.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/basic_tests/test_store.py b/tests/basic_tests/test_store.py index 0d1d2517..9ae9c0c7 100644 --- a/tests/basic_tests/test_store.py +++ b/tests/basic_tests/test_store.py @@ -192,6 +192,9 @@ def setUp(self): } self.global_metadata_desc = { 'comment': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_VARCHAR, max_size=65535, default_value=' '), + 'signature': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_VARCHAR, max_size=256, default_value=' '), + 'tags': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_ARRAY, element_type=GlobalMetadataDesc.DTYPE_INT32, + max_size=128), } self.node_groups = [LAZY_ROOT_NAME, "group1", "group2"] @@ -278,6 +281,7 @@ def test_query_with_filter_non_exist(self): def test_reload(self): self.store.update_nodes([self.node1, self.node2, self.node3]) + # reload from storage del self.store self.store = MilvusStore(group_embed_keys=self.group_embed_keys, embed=self.mock_embed, embed_dims=self.embed_dims, global_metadata_desc=self.global_metadata_desc, @@ -291,6 +295,9 @@ def test_reload(self): for orig_node in orig_nodes: if node._uid == orig_node._uid: self.assertEqual(node.text, orig_node.text) + # builtin fields are not in orig node, so we can not use node.global_metadata == orig_node.global_metadata + for k, v in orig_node.global_metadata.items(): + self.assertEqual(node.global_metadata[k], v) break # XXX `array_contains_any` is not supported in local(aka lite) mode. skip this ut From 38da05324058d6fe90e14e21390c4c48286d5abd Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Tue, 17 Dec 2024 10:14:22 +0800 Subject: [PATCH 12/14] test fix --- requirements.full.txt | 2 +- requirements.txt | 2 +- scripts/check_requirements.py | 2 +- tests/basic_tests/test_store.py | 2 +- tests/requirements.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.full.txt b/requirements.full.txt index c3beac91..3fdf1b93 100644 --- a/requirements.full.txt +++ b/requirements.full.txt @@ -31,7 +31,7 @@ psutil pypdf pytest numpy==1.26.4 -pymilvus +pymilvus>=2.4.7, <2.5.0 async-timeout httpx<0.28.0 redis>=5.0.4 diff --git a/requirements.txt b/requirements.txt index 2544f32d..31a9f0bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,6 +31,6 @@ psutil pypdf pytest numpy==1.26.4 -pymilvus +pymilvus>=2.4.7, <2.5.0 async-timeout httpx<0.28.0 diff --git a/scripts/check_requirements.py b/scripts/check_requirements.py index 980cf05a..09187353 100644 --- a/scripts/check_requirements.py +++ b/scripts/check_requirements.py @@ -28,7 +28,7 @@ def parse_requirement(line): return None, None def compare_versions(version_spec, req_version): - if version_spec.startswith('^') and req_version == '*': + if version_spec.startswith('^'): return True return version_spec == req_version diff --git a/tests/basic_tests/test_store.py b/tests/basic_tests/test_store.py index 9ae9c0c7..f07f22d5 100644 --- a/tests/basic_tests/test_store.py +++ b/tests/basic_tests/test_store.py @@ -194,7 +194,7 @@ def setUp(self): 'comment': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_VARCHAR, max_size=65535, default_value=' '), 'signature': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_VARCHAR, max_size=256, default_value=' '), 'tags': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_ARRAY, element_type=GlobalMetadataDesc.DTYPE_INT32, - max_size=128), + max_size=128, default_value=[]), } self.node_groups = [LAZY_ROOT_NAME, "group1", "group2"] diff --git a/tests/requirements.txt b/tests/requirements.txt index 5a33e8db..30955a06 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -3,4 +3,4 @@ docx2txt olefile pytest-rerunfailures pytest-order -pymilvus +pymilvus>=2.4.7, <2.5.0 From f835a91b140d12d691d5262853b599be7b8784b9 Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Tue, 17 Dec 2024 16:28:19 +0800 Subject: [PATCH 13/14] fix lint --- tests/basic_tests/test_store.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/basic_tests/test_store.py b/tests/basic_tests/test_store.py index f07f22d5..240bc7cf 100644 --- a/tests/basic_tests/test_store.py +++ b/tests/basic_tests/test_store.py @@ -193,8 +193,8 @@ def setUp(self): self.global_metadata_desc = { 'comment': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_VARCHAR, max_size=65535, default_value=' '), 'signature': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_VARCHAR, max_size=256, default_value=' '), - 'tags': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_ARRAY, element_type=GlobalMetadataDesc.DTYPE_INT32, - max_size=128, default_value=[]), + 'tags': GlobalMetadataDesc(data_type=GlobalMetadataDesc.DTYPE_ARRAY, + element_type=GlobalMetadataDesc.DTYPE_INT32, max_size=128, default_value=[]), } self.node_groups = [LAZY_ROOT_NAME, "group1", "group2"] @@ -295,7 +295,8 @@ def test_reload(self): for orig_node in orig_nodes: if node._uid == orig_node._uid: self.assertEqual(node.text, orig_node.text) - # builtin fields are not in orig node, so we can not use node.global_metadata == orig_node.global_metadata + # builtin fields are not in orig node, so we can not use + # node.global_metadata == orig_node.global_metadata for k, v in orig_node.global_metadata.items(): self.assertEqual(node.global_metadata[k], v) break From d8635befc2d93083e177f3378909de86bd601494 Mon Sep 17 00:00:00 2001 From: ouguoyu Date: Wed, 18 Dec 2024 09:56:08 +0800 Subject: [PATCH 14/14] update test and trigger ci --- tests/basic_tests/test_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/basic_tests/test_store.py b/tests/basic_tests/test_store.py index 6872ebf3..4739fe2f 100644 --- a/tests/basic_tests/test_store.py +++ b/tests/basic_tests/test_store.py @@ -359,7 +359,7 @@ def setUp(self): }, { 'embed_key': 'vec2', - 'index_type': 'SPARSE_INVERTED_INDEX', + 'index_type': 'SPARSE_WAND', 'metric_type': 'IP', } ]