From 5079ed7059b782a77f5ac8478782f1505068d644 Mon Sep 17 00:00:00 2001 From: xiaorui1 Date: Mon, 20 Jan 2025 20:45:19 +0800 Subject: [PATCH 01/10] fix:run out of mem --- api/core/rag/datasource/vdb/vector_factory.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index bdc40e29c7e838..858e7ed06f07a7 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -150,11 +150,14 @@ def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]: return OceanBaseVectorFactory case _: raise ValueError(f"Vector store {vector_type} is not supported.") - + + max_batch_documents = 1000 def create(self, texts: Optional[list] = None, **kwargs): if texts: - embeddings = self._embeddings.embed_documents([document.page_content for document in texts]) - self._vector_processor.create(texts=texts, embeddings=embeddings, **kwargs) + for i in range(0, len(texts), self.max_documents): + batch_documents = texts[i:i + self.max_documents] + batch_embeddings = self._embeddings.embed_documents([document.page_content for document in batch_documents]) + self._vector_processor.create(texts=batch_documents, embeddings=batch_embeddings, **kwargs) def add_texts(self, documents: list[Document], **kwargs): if kwargs.get("duplicate_check", False): From 42189fb2e4d4f43141f0eb83652ac36cd3ce70db Mon Sep 17 00:00:00 2001 From: rayshaw001 <396301947@163.com> Date: Mon, 20 Jan 2025 20:47:48 +0800 Subject: [PATCH 02/10] fix typo --- api/core/rag/datasource/vdb/vector_factory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 858e7ed06f07a7..6d525f53f4132e 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -154,8 +154,8 @@ def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]: max_batch_documents = 1000 def create(self, texts: Optional[list] = None, **kwargs): if texts: - for i in range(0, len(texts), self.max_documents): - batch_documents = texts[i:i + self.max_documents] + for i in range(0, len(texts), self.max_batch_documents): + batch_documents = texts[i:i + self.max_batch_documents] batch_embeddings = self._embeddings.embed_documents([document.page_content for document in batch_documents]) self._vector_processor.create(texts=batch_documents, embeddings=batch_embeddings, **kwargs) From 7e71501910579dfc20d6a246d32bf6a52d98c42b Mon Sep 17 00:00:00 2001 From: rayshaw001 <396301947@163.com> Date: Tue, 21 Jan 2025 10:36:37 +0800 Subject: [PATCH 03/10] fix E501 --- api/core/rag/datasource/vdb/vector_factory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 6d525f53f4132e..e2148309f8ca4c 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -156,7 +156,8 @@ def create(self, texts: Optional[list] = None, **kwargs): if texts: for i in range(0, len(texts), self.max_batch_documents): batch_documents = texts[i:i + self.max_batch_documents] - batch_embeddings = self._embeddings.embed_documents([document.page_content for document in batch_documents]) + batch_contents = [document.page_content for document in batch_documents] + batch_embeddings = self._embeddings.embed_documents(batch_contents) self._vector_processor.create(texts=batch_documents, embeddings=batch_embeddings, **kwargs) def add_texts(self, documents: list[Document], **kwargs): From a9ae412f618d0de51a7530700c429bc121e5d607 Mon Sep 17 00:00:00 2001 From: xiaorui1 Date: Tue, 21 Jan 2025 10:45:07 +0800 Subject: [PATCH 04/10] fix E301 --- api/core/rag/datasource/vdb/vector_factory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index e2148309f8ca4c..110e71423d2751 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -152,6 +152,7 @@ def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]: raise ValueError(f"Vector store {vector_type} is not supported.") max_batch_documents = 1000 + def create(self, texts: Optional[list] = None, **kwargs): if texts: for i in range(0, len(texts), self.max_batch_documents): From bb065883be1e0066bb2ae1995e62194b622bf1ef Mon Sep 17 00:00:00 2001 From: xiaorui1 Date: Tue, 21 Jan 2025 10:51:18 +0800 Subject: [PATCH 05/10] fix python lint --- api/core/rag/datasource/vdb/vector_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 110e71423d2751..91e0db43c2d658 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -150,7 +150,7 @@ def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]: return OceanBaseVectorFactory case _: raise ValueError(f"Vector store {vector_type} is not supported.") - + max_batch_documents = 1000 def create(self, texts: Optional[list] = None, **kwargs): From 8dd961b0187a205f7ff32b8f355470fa9c675d90 Mon Sep 17 00:00:00 2001 From: xiaorui1 Date: Tue, 21 Jan 2025 11:01:17 +0800 Subject: [PATCH 06/10] fix py lint --- api/core/rag/datasource/vdb/vector_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 91e0db43c2d658..d02b3425986eca 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -156,7 +156,7 @@ def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]: def create(self, texts: Optional[list] = None, **kwargs): if texts: for i in range(0, len(texts), self.max_batch_documents): - batch_documents = texts[i:i + self.max_batch_documents] + batch_documents = texts[i : i + self.max_batch_documents] batch_contents = [document.page_content for document in batch_documents] batch_embeddings = self._embeddings.embed_documents(batch_contents) self._vector_processor.create(texts=batch_documents, embeddings=batch_embeddings, **kwargs) From aef34ba38bbe42c13176a1fe275962e4ffa14e27 Mon Sep 17 00:00:00 2001 From: xiaorui1 Date: Tue, 21 Jan 2025 15:22:22 +0800 Subject: [PATCH 07/10] make max_batch_documents local var --- api/core/rag/datasource/vdb/vector_factory.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index d02b3425986eca..ae4c57928b192e 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -151,12 +151,11 @@ def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]: case _: raise ValueError(f"Vector store {vector_type} is not supported.") - max_batch_documents = 1000 - def create(self, texts: Optional[list] = None, **kwargs): + max_batch_documents = 1000 if texts: - for i in range(0, len(texts), self.max_batch_documents): - batch_documents = texts[i : i + self.max_batch_documents] + for i in range(0, len(texts), max_batch_documents): + batch_documents = texts[i : i + max_batch_documents] batch_contents = [document.page_content for document in batch_documents] batch_embeddings = self._embeddings.embed_documents(batch_contents) self._vector_processor.create(texts=batch_documents, embeddings=batch_embeddings, **kwargs) From 57e6d8913f09fbb4ceb6c45f593f92186e64fe88 Mon Sep 17 00:00:00 2001 From: xiaorui1 Date: Tue, 21 Jan 2025 17:42:16 +0800 Subject: [PATCH 08/10] use add_texts method instead --- api/core/rag/datasource/vdb/vector_factory.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index ae4c57928b192e..adcba442558a78 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -156,9 +156,7 @@ def create(self, texts: Optional[list] = None, **kwargs): if texts: for i in range(0, len(texts), max_batch_documents): batch_documents = texts[i : i + max_batch_documents] - batch_contents = [document.page_content for document in batch_documents] - batch_embeddings = self._embeddings.embed_documents(batch_contents) - self._vector_processor.create(texts=batch_documents, embeddings=batch_embeddings, **kwargs) + self.add_texts(batch_documents, duplicate_check=True, **kwargs) def add_texts(self, documents: list[Document], **kwargs): if kwargs.get("duplicate_check", False): From 87f16fc9c99d63b19aa03fe413dfb8e585fbcb90 Mon Sep 17 00:00:00 2001 From: rayshaw001 <396301947@163.com> Date: Wed, 22 Jan 2025 10:47:34 +0800 Subject: [PATCH 09/10] create collection once --- api/core/rag/datasource/vdb/vector_factory.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index adcba442558a78..30e0a14e717511 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -156,7 +156,12 @@ def create(self, texts: Optional[list] = None, **kwargs): if texts: for i in range(0, len(texts), max_batch_documents): batch_documents = texts[i : i + max_batch_documents] - self.add_texts(batch_documents, duplicate_check=True, **kwargs) + batch_contents = [document.page_content for document in batch_documents] + batch_embeddings = self._embeddings.embed_documents(batch_contents) + if i < max_batch_documents: + self._vector_processor.create(texts=batch_documents, embeddings=batch_embeddings, **kwargs) + else: + self._vector_processor.add_texts(texts=batch_documents, embeddings=batch_embeddings, **kwargs) def add_texts(self, documents: list[Document], **kwargs): if kwargs.get("duplicate_check", False): From 388691323d38f6d62654d76a8ef4b80ddd4b24dc Mon Sep 17 00:00:00 2001 From: rayshaw001 <396301947@163.com> Date: Tue, 11 Feb 2025 19:48:24 +0800 Subject: [PATCH 10/10] fix param name --- api/core/rag/datasource/vdb/vector_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 30e0a14e717511..6f3b6d53eb1374 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -161,7 +161,7 @@ def create(self, texts: Optional[list] = None, **kwargs): if i < max_batch_documents: self._vector_processor.create(texts=batch_documents, embeddings=batch_embeddings, **kwargs) else: - self._vector_processor.add_texts(texts=batch_documents, embeddings=batch_embeddings, **kwargs) + self._vector_processor.add_texts(documents=batch_documents, embeddings=batch_embeddings, **kwargs) def add_texts(self, documents: list[Document], **kwargs): if kwargs.get("duplicate_check", False):