From 0e04ce94e5bdee7a5cc4a96add7328ae46554cc4 Mon Sep 17 00:00:00 2001 From: kevin-yang-racap Date: Tue, 12 Nov 2024 13:53:17 -0500 Subject: [PATCH 1/5] Added dynamic node limit when retrieving get_rel_map for property graph --- .../property_graph/sub_retrievers/llm_synonym.py | 3 ++- .../property_graph/sub_retrievers/vector.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py index 17938c1ed77a5..d7091fc6dd051 100644 --- a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py +++ b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py @@ -86,11 +86,12 @@ def _parse_llm_output(self, output: str) -> List[str]: # capitalize to normalize with ingestion return [x.strip().capitalize() for x in matches if x.strip()] - def _prepare_matches(self, matches: List[str]) -> List[NodeWithScore]: + def _prepare_matches(self, matches: List[str], limit: int = 30) -> List[NodeWithScore]: kg_nodes = self._graph_store.get(ids=matches) triplets = self._graph_store.get_rel_map( kg_nodes, depth=self._path_depth, + limit=limit, ignore_rels=[KG_SOURCE_REL], ) diff --git a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py index effbcd7e5471e..4d3af683a31b9 100644 --- a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py +++ b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py @@ -112,7 +112,7 @@ async def _aget_vector_store_query( **self._retriever_kwargs, ) - def retrieve_from_graph(self, query_bundle: QueryBundle) -> List[NodeWithScore]: + def retrieve_from_graph(self, query_bundle: QueryBundle, limit: int = 30) -> List[NodeWithScore]: vector_store_query = self._get_vector_store_query(query_bundle) triplets = [] @@ -126,7 +126,7 @@ def retrieve_from_graph(self, query_bundle: QueryBundle) -> List[NodeWithScore]: kg_ids = [node.id for node in kg_nodes] triplets = self._graph_store.get_rel_map( - kg_nodes, depth=self._path_depth, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] ) elif self._vector_store is not None: @@ -136,7 +136,7 @@ def retrieve_from_graph(self, query_bundle: QueryBundle) -> List[NodeWithScore]: scores = query_result.similarities kg_nodes = self._graph_store.get(ids=kg_ids) triplets = self._graph_store.get_rel_map( - kg_nodes, depth=self._path_depth, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] ) elif query_result.ids is not None and query_result.similarities is not None: @@ -144,7 +144,7 @@ def retrieve_from_graph(self, query_bundle: QueryBundle) -> List[NodeWithScore]: scores = query_result.similarities kg_nodes = self._graph_store.get(ids=kg_ids) triplets = self._graph_store.get_rel_map( - kg_nodes, depth=self._path_depth, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] ) for triplet in triplets: @@ -174,7 +174,7 @@ def retrieve_from_graph(self, query_bundle: QueryBundle) -> List[NodeWithScore]: return self._get_nodes_with_score([x[0] for x in top_k], [x[1] for x in top_k]) async def aretrieve_from_graph( - self, query_bundle: QueryBundle + self, query_bundle: QueryBundle, limit: int = 30 ) -> List[NodeWithScore]: vector_store_query = await self._aget_vector_store_query(query_bundle) @@ -189,7 +189,7 @@ async def aretrieve_from_graph( kg_nodes, scores = result kg_ids = [node.id for node in kg_nodes] triplets = await self._graph_store.aget_rel_map( - kg_nodes, depth=self._path_depth, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] ) elif self._vector_store is not None: @@ -199,7 +199,7 @@ async def aretrieve_from_graph( scores = query_result.similarities kg_nodes = await self._graph_store.aget(ids=kg_ids) triplets = await self._graph_store.aget_rel_map( - kg_nodes, depth=self._path_depth, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] ) elif query_result.ids is not None and query_result.similarities is not None: @@ -207,7 +207,7 @@ async def aretrieve_from_graph( scores = query_result.similarities kg_nodes = await self._graph_store.aget(ids=kg_ids) triplets = await self._graph_store.aget_rel_map( - kg_nodes, depth=self._path_depth, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] ) for triplet in triplets: From b4f8f38d27d886d8d37e8ce4c44837393fc350f9 Mon Sep 17 00:00:00 2001 From: kevin-yang-racap Date: Tue, 12 Nov 2024 13:55:59 -0500 Subject: [PATCH 2/5] added limit to llm_synonym --- .../indices/property_graph/sub_retrievers/llm_synonym.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py index d7091fc6dd051..61008cd5eb11f 100644 --- a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py +++ b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py @@ -107,24 +107,24 @@ async def _aprepare_matches(self, matches: List[str]) -> List[NodeWithScore]: return self._get_nodes_with_score(triplets) - def retrieve_from_graph(self, query_bundle: QueryBundle) -> List[NodeWithScore]: + def retrieve_from_graph(self, query_bundle: QueryBundle, limit: int = 30) -> List[NodeWithScore]: response = self._llm.predict( self._synonym_prompt, query_str=query_bundle.query_str, max_keywords=self._max_keywords, ) - matches = self._parse_llm_output(response) + matches = self._parse_llm_output(response, limit=limit) return self._prepare_matches(matches) async def aretrieve_from_graph( - self, query_bundle: QueryBundle + self, query_bundle: QueryBundle, limit: int = 30 ) -> List[NodeWithScore]: response = await self._llm.apredict( self._synonym_prompt, query_str=query_bundle.query_str, max_keywords=self._max_keywords, ) - matches = self._parse_llm_output(response) + matches = self._parse_llm_output(response, limit=limit) return await self._aprepare_matches(matches) From 2a8d57e9776aa340f85537391456127b8fbc1143 Mon Sep 17 00:00:00 2001 From: kevin-yang-racap Date: Tue, 12 Nov 2024 14:02:42 -0500 Subject: [PATCH 3/5] Added default limit when declaring retriever --- .../sub_retrievers/llm_synonym.py | 19 +++++++++++-------- .../property_graph/sub_retrievers/vector.py | 18 ++++++++++-------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py index 61008cd5eb11f..eabf08d3e0864 100644 --- a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py +++ b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py @@ -59,6 +59,7 @@ def __init__( ] = DEFAULT_SYNONYM_EXPAND_TEMPLATE, max_keywords: int = 10, path_depth: int = 1, + limit: int = 30, output_parsing_fn: Optional[Callable] = None, llm: Optional[LLM] = None, **kwargs: Any, @@ -70,6 +71,7 @@ def __init__( self._output_parsing_fn = output_parsing_fn self._max_keywords = max_keywords self._path_depth = path_depth + self._limit = limit super().__init__( graph_store=graph_store, include_text=include_text, @@ -86,7 +88,7 @@ def _parse_llm_output(self, output: str) -> List[str]: # capitalize to normalize with ingestion return [x.strip().capitalize() for x in matches if x.strip()] - def _prepare_matches(self, matches: List[str], limit: int = 30) -> List[NodeWithScore]: + def _prepare_matches(self, matches: List[str], limit: Optional[int] = 30) -> List[NodeWithScore]: kg_nodes = self._graph_store.get(ids=matches) triplets = self._graph_store.get_rel_map( kg_nodes, @@ -97,34 +99,35 @@ def _prepare_matches(self, matches: List[str], limit: int = 30) -> List[NodeWith return self._get_nodes_with_score(triplets) - async def _aprepare_matches(self, matches: List[str]) -> List[NodeWithScore]: + async def _aprepare_matches(self, matches: List[str], limit: Optional[int] = None) -> List[NodeWithScore]: kg_nodes = await self._graph_store.aget(ids=matches) triplets = await self._graph_store.aget_rel_map( kg_nodes, depth=self._path_depth, + limit=limit, ignore_rels=[KG_SOURCE_REL], ) return self._get_nodes_with_score(triplets) - def retrieve_from_graph(self, query_bundle: QueryBundle, limit: int = 30) -> List[NodeWithScore]: + def retrieve_from_graph(self, query_bundle: QueryBundle, limit: Optional[int] = None) -> List[NodeWithScore]: response = self._llm.predict( self._synonym_prompt, query_str=query_bundle.query_str, max_keywords=self._max_keywords, ) - matches = self._parse_llm_output(response, limit=limit) + matches = self._parse_llm_output(response) - return self._prepare_matches(matches) + return self._prepare_matches(matches, limit=limit or self._limit) async def aretrieve_from_graph( - self, query_bundle: QueryBundle, limit: int = 30 + self, query_bundle: QueryBundle, limit: Optional[int] = None ) -> List[NodeWithScore]: response = await self._llm.apredict( self._synonym_prompt, query_str=query_bundle.query_str, max_keywords=self._max_keywords, ) - matches = self._parse_llm_output(response, limit=limit) + matches = self._parse_llm_output(response) - return await self._aprepare_matches(matches) + return await self._aprepare_matches(matches, limit=limit or self._limit) diff --git a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py index 4d3af683a31b9..6f2393b025bb1 100644 --- a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py +++ b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py @@ -49,6 +49,7 @@ def __init__( vector_store: Optional[BasePydanticVectorStore] = None, similarity_top_k: int = 4, path_depth: int = 1, + limit: int = 30, similarity_score: Optional[float] = None, filters: Optional[MetadataFilters] = None, **kwargs: Any, @@ -58,6 +59,7 @@ def __init__( self._similarity_top_k = similarity_top_k self._vector_store = vector_store self._path_depth = path_depth + self._limit = limit self._similarity_score = similarity_score self._filters = filters @@ -112,7 +114,7 @@ async def _aget_vector_store_query( **self._retriever_kwargs, ) - def retrieve_from_graph(self, query_bundle: QueryBundle, limit: int = 30) -> List[NodeWithScore]: + def retrieve_from_graph(self, query_bundle: QueryBundle, limit: Optional[int] = None) -> List[NodeWithScore]: vector_store_query = self._get_vector_store_query(query_bundle) triplets = [] @@ -126,7 +128,7 @@ def retrieve_from_graph(self, query_bundle: QueryBundle, limit: int = 30) -> Lis kg_ids = [node.id for node in kg_nodes] triplets = self._graph_store.get_rel_map( - kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] ) elif self._vector_store is not None: @@ -136,7 +138,7 @@ def retrieve_from_graph(self, query_bundle: QueryBundle, limit: int = 30) -> Lis scores = query_result.similarities kg_nodes = self._graph_store.get(ids=kg_ids) triplets = self._graph_store.get_rel_map( - kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] ) elif query_result.ids is not None and query_result.similarities is not None: @@ -144,7 +146,7 @@ def retrieve_from_graph(self, query_bundle: QueryBundle, limit: int = 30) -> Lis scores = query_result.similarities kg_nodes = self._graph_store.get(ids=kg_ids) triplets = self._graph_store.get_rel_map( - kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] ) for triplet in triplets: @@ -174,7 +176,7 @@ def retrieve_from_graph(self, query_bundle: QueryBundle, limit: int = 30) -> Lis return self._get_nodes_with_score([x[0] for x in top_k], [x[1] for x in top_k]) async def aretrieve_from_graph( - self, query_bundle: QueryBundle, limit: int = 30 + self, query_bundle: QueryBundle, limit: Optional[int] = None ) -> List[NodeWithScore]: vector_store_query = await self._aget_vector_store_query(query_bundle) @@ -189,7 +191,7 @@ async def aretrieve_from_graph( kg_nodes, scores = result kg_ids = [node.id for node in kg_nodes] triplets = await self._graph_store.aget_rel_map( - kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] ) elif self._vector_store is not None: @@ -199,7 +201,7 @@ async def aretrieve_from_graph( scores = query_result.similarities kg_nodes = await self._graph_store.aget(ids=kg_ids) triplets = await self._graph_store.aget_rel_map( - kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] ) elif query_result.ids is not None and query_result.similarities is not None: @@ -207,7 +209,7 @@ async def aretrieve_from_graph( scores = query_result.similarities kg_nodes = await self._graph_store.aget(ids=kg_ids) triplets = await self._graph_store.aget_rel_map( - kg_nodes, depth=self._path_depth, limit=limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] ) for triplet in triplets: From 9a113283b2f2a4f8444daf52128275666ff41b79 Mon Sep 17 00:00:00 2001 From: kevin-yang-racap Date: Tue, 12 Nov 2024 14:10:33 -0500 Subject: [PATCH 4/5] Missed two defaulted limits --- .../indices/property_graph/sub_retrievers/llm_synonym.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py index eabf08d3e0864..976eef06b6797 100644 --- a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py +++ b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py @@ -88,12 +88,12 @@ def _parse_llm_output(self, output: str) -> List[str]: # capitalize to normalize with ingestion return [x.strip().capitalize() for x in matches if x.strip()] - def _prepare_matches(self, matches: List[str], limit: Optional[int] = 30) -> List[NodeWithScore]: + def _prepare_matches(self, matches: List[str], limit: Optional[int] = None) -> List[NodeWithScore]: kg_nodes = self._graph_store.get(ids=matches) triplets = self._graph_store.get_rel_map( kg_nodes, depth=self._path_depth, - limit=limit, + limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL], ) @@ -104,7 +104,7 @@ async def _aprepare_matches(self, matches: List[str], limit: Optional[int] = Non triplets = await self._graph_store.aget_rel_map( kg_nodes, depth=self._path_depth, - limit=limit, + limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL], ) From 51091352f549115169e143c4fed751cade0fad28 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Tue, 12 Nov 2024 16:23:27 -0600 Subject: [PATCH 5/5] linting --- .../sub_retrievers/llm_synonym.py | 12 +++++-- .../property_graph/sub_retrievers/vector.py | 34 +++++++++++++++---- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py index 976eef06b6797..4248af17fc41b 100644 --- a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py +++ b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/llm_synonym.py @@ -88,7 +88,9 @@ def _parse_llm_output(self, output: str) -> List[str]: # capitalize to normalize with ingestion return [x.strip().capitalize() for x in matches if x.strip()] - def _prepare_matches(self, matches: List[str], limit: Optional[int] = None) -> List[NodeWithScore]: + def _prepare_matches( + self, matches: List[str], limit: Optional[int] = None + ) -> List[NodeWithScore]: kg_nodes = self._graph_store.get(ids=matches) triplets = self._graph_store.get_rel_map( kg_nodes, @@ -99,7 +101,9 @@ def _prepare_matches(self, matches: List[str], limit: Optional[int] = None) -> L return self._get_nodes_with_score(triplets) - async def _aprepare_matches(self, matches: List[str], limit: Optional[int] = None) -> List[NodeWithScore]: + async def _aprepare_matches( + self, matches: List[str], limit: Optional[int] = None + ) -> List[NodeWithScore]: kg_nodes = await self._graph_store.aget(ids=matches) triplets = await self._graph_store.aget_rel_map( kg_nodes, @@ -110,7 +114,9 @@ async def _aprepare_matches(self, matches: List[str], limit: Optional[int] = Non return self._get_nodes_with_score(triplets) - def retrieve_from_graph(self, query_bundle: QueryBundle, limit: Optional[int] = None) -> List[NodeWithScore]: + def retrieve_from_graph( + self, query_bundle: QueryBundle, limit: Optional[int] = None + ) -> List[NodeWithScore]: response = self._llm.predict( self._synonym_prompt, query_str=query_bundle.query_str, diff --git a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py index 6f2393b025bb1..93611fd3b14be 100644 --- a/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py +++ b/llama-index-core/llama_index/core/indices/property_graph/sub_retrievers/vector.py @@ -114,7 +114,9 @@ async def _aget_vector_store_query( **self._retriever_kwargs, ) - def retrieve_from_graph(self, query_bundle: QueryBundle, limit: Optional[int] = None) -> List[NodeWithScore]: + def retrieve_from_graph( + self, query_bundle: QueryBundle, limit: Optional[int] = None + ) -> List[NodeWithScore]: vector_store_query = self._get_vector_store_query(query_bundle) triplets = [] @@ -128,7 +130,10 @@ def retrieve_from_graph(self, query_bundle: QueryBundle, limit: Optional[int] = kg_ids = [node.id for node in kg_nodes] triplets = self._graph_store.get_rel_map( - kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, + depth=self._path_depth, + limit=limit or self._limit, + ignore_rels=[KG_SOURCE_REL], ) elif self._vector_store is not None: @@ -138,7 +143,10 @@ def retrieve_from_graph(self, query_bundle: QueryBundle, limit: Optional[int] = scores = query_result.similarities kg_nodes = self._graph_store.get(ids=kg_ids) triplets = self._graph_store.get_rel_map( - kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, + depth=self._path_depth, + limit=limit or self._limit, + ignore_rels=[KG_SOURCE_REL], ) elif query_result.ids is not None and query_result.similarities is not None: @@ -146,7 +154,10 @@ def retrieve_from_graph(self, query_bundle: QueryBundle, limit: Optional[int] = scores = query_result.similarities kg_nodes = self._graph_store.get(ids=kg_ids) triplets = self._graph_store.get_rel_map( - kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, + depth=self._path_depth, + limit=limit or self._limit, + ignore_rels=[KG_SOURCE_REL], ) for triplet in triplets: @@ -191,7 +202,10 @@ async def aretrieve_from_graph( kg_nodes, scores = result kg_ids = [node.id for node in kg_nodes] triplets = await self._graph_store.aget_rel_map( - kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, + depth=self._path_depth, + limit=limit or self._limit, + ignore_rels=[KG_SOURCE_REL], ) elif self._vector_store is not None: @@ -201,7 +215,10 @@ async def aretrieve_from_graph( scores = query_result.similarities kg_nodes = await self._graph_store.aget(ids=kg_ids) triplets = await self._graph_store.aget_rel_map( - kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, + depth=self._path_depth, + limit=limit or self._limit, + ignore_rels=[KG_SOURCE_REL], ) elif query_result.ids is not None and query_result.similarities is not None: @@ -209,7 +226,10 @@ async def aretrieve_from_graph( scores = query_result.similarities kg_nodes = await self._graph_store.aget(ids=kg_ids) triplets = await self._graph_store.aget_rel_map( - kg_nodes, depth=self._path_depth, limit=limit or self._limit, ignore_rels=[KG_SOURCE_REL] + kg_nodes, + depth=self._path_depth, + limit=limit or self._limit, + ignore_rels=[KG_SOURCE_REL], ) for triplet in triplets: