Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: metrics in neo4j adapter [COG-1082] #487

Merged
merged 33 commits into from
Feb 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
458eeac
Count the number of tokens in documents
alekszievr Jan 28, 2025
51eadef
Merge branch 'COG-970-refactor-tokenizing' into feat/cog-1071-input-t…
alekszievr Jan 28, 2025
ba608a4
Merge branch 'COG-970-refactor-tokenizing' into feat/cog-1071-input-t…
alekszievr Jan 28, 2025
f6663ab
save token count to relational db
alekszievr Jan 28, 2025
9182be8
Merge branch 'COG-970-refactor-tokenizing' into feat/cog-1132-add-num…
alekszievr Jan 28, 2025
72dfec4
Add metrics to metric table
alekszievr Jan 28, 2025
9bd5917
Merge branch 'dev' into feat/cog-1071-input-token-counting
dexters1 Jan 29, 2025
227d94e
Merge branch 'feat/cog-1071-input-token-counting' into feat/cog-1132-…
alekszievr Jan 29, 2025
22b6459
Store list as json instead of array in relational db table
alekszievr Jan 29, 2025
9764441
Merge branch 'dev' into feat/cog-1132-add-num-tokens-to-metric-table
alekszievr Jan 29, 2025
100e7d7
Sum in sql instead of python
alekszievr Jan 29, 2025
c182d47
Unify naming
alekszievr Jan 29, 2025
44fa2cd
Return data_points in descriptive metric calculation task
alekszievr Jan 29, 2025
06030ff
Graph metrics getter template in graph db interface and adapters
alekszievr Jan 29, 2025
67d9908
Calculate descriptive metrics in networkx adapter
alekszievr Jan 29, 2025
252ac7f
neo4j metrics
alekszievr Jan 29, 2025
48a51a3
Merge branch 'dev' into feat/cog-1082-metrics-in-graphdb-interface
alekszievr Jan 30, 2025
9a94db8
remove _table from table name
alekszievr Jan 30, 2025
57fb338
Merge branch 'dev' into feat/cog-1082-metrics-in-graphdb-interface
alekszievr Jan 31, 2025
e8dcef1
Merge branch 'dev' into feat/cog-1082-metrics-in-graphdb-interface
alekszievr Feb 1, 2025
b0f6ba7
Merge branch 'dev' into feat/cog-1082-metrics-in-graphdb-interface
alekszievr Feb 3, 2025
05138fa
Use modules for adding to db instead of infrastructure
alekszievr Feb 3, 2025
f064f52
Merge branch 'feat/cog-1082-metrics-in-graphdb-interface' into feat/c…
alekszievr Feb 3, 2025
c9ee1bc
Merge branch 'feat/cog-1082-metrics-in-networkx-adapter' into feat/co…
alekszievr Feb 3, 2025
af8e798
Merge branch 'dev' into feat/cog-1082-metrics-in-networkx-adapter
alekszievr Feb 3, 2025
406057f
Merge branch 'feat/cog-1082-metrics-in-networkx-adapter' into feat/co…
alekszievr Feb 3, 2025
d93b5f5
minor fixes
alekszievr Feb 3, 2025
c13fdec
minor cleanup
alekszievr Feb 3, 2025
f2ad1d4
Merge branch 'dev' into feat/cog-1082-metrics-in-neo4j-adapter
alekszievr Feb 3, 2025
3e67828
Remove graph metric calculation from the default cognify pipeline
alekszievr Feb 4, 2025
58e5275
Merge branch 'dev' into feat/cog-1082-metrics-in-neo4j-adapter
alekszievr Feb 4, 2025
dc06b50
Merge branch 'dev' into feat/cog-1082-metrics-in-neo4j-adapter
alekszievr Feb 5, 2025
91b42ab
Merge branch 'dev' into feat/cog-1082-metrics-in-neo4j-adapter
alekszievr Feb 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion cognee/api/v1/cognify/cognify_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ async def get_default_tasks(
task_config={"batch_size": 10},
),
Task(add_data_points, task_config={"batch_size": 10}),
Task(store_descriptive_metrics, include_optional=True),
]
except Exception as error:
send_telemetry("cognee.cognify DEFAULT TASKS CREATION ERRORED", user.id)
Expand Down
150 changes: 139 additions & 11 deletions cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,16 +531,144 @@ async def get_filtered_graph_data(self, attribute_filters):

return (nodes, edges)

async def graph_exists(self, graph_name="myGraph"):
    """Return whether a GDS in-memory graph named ``graph_name`` is listed.

    Runs ``gds.graph.list()`` through ``self.query`` and checks the collected
    names. An empty query result is treated as "no projected graphs".
    """
    rows = await self.query(
        "CALL gds.graph.list() YIELD graphName RETURN collect(graphName) AS graphNames;"
    )
    if not rows:
        return False
    return graph_name in rows[0]["graphNames"]

async def project_entire_graph(self, graph_name="myGraph"):
    """Project all node labels and relationship types into an in-memory GDS graph.

    Does nothing when ``self.graph_exists(graph_name)`` reports that a
    projection with that name already exists.

    Args:
        graph_name: Name of the GDS projection to create.

    Raises:
        ValueError: If the database reports no node labels or no relationship types.
    """
    if await self.graph_exists(graph_name):
        return

    def _cypher_string(value):
        # These values are spliced into the query text as single-quoted Cypher
        # literals, so backslashes and single quotes must be escaped; otherwise a
        # label such as "O'Brien" produces a malformed query.
        escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    node_labels_result = await self.query(
        "CALL db.labels() YIELD label RETURN collect(label) AS labels;"
    )
    node_labels = node_labels_result[0]["labels"] if node_labels_result else []

    relationship_types_result = await self.query(
        "CALL db.relationshipTypes() YIELD relationshipType "
        "RETURN collect(relationshipType) AS relationships;"
    )
    relationship_types = (
        relationship_types_result[0]["relationships"] if relationship_types_result else []
    )

    if not node_labels or not relationship_types:
        raise ValueError("No node labels or relationship types found in the database.")

    node_labels_str = "[" + ", ".join(_cypher_string(label) for label in node_labels) + "]"
    relationship_types_str = (
        "[" + ", ".join(_cypher_string(rel) for rel in relationship_types) + "]"
    )

    query = f"""
    CALL gds.graph.project(
        {_cypher_string(graph_name)},
        {node_labels_str},
        {relationship_types_str}
    ) YIELD graphName;
    """

    await self.query(query)

async def drop_graph(self, graph_name="myGraph"):
    """Drop the GDS in-memory graph ``graph_name`` if it is currently projected.

    Existence is checked through ``self.graph_exists`` first, so no drop query
    is issued for a projection that is not listed.
    """
    if not await self.graph_exists(graph_name):
        return

    # The name is embedded in a single-quoted Cypher literal; escape backslashes
    # and quotes so an unusual graph name cannot break the statement.
    escaped_name = str(graph_name).replace("\\", "\\\\").replace("'", "\\'")
    await self.query(f"CALL gds.graph.drop('{escaped_name}');")

async def get_graph_metrics(self, include_optional=False):
    """Compute descriptive metrics for the graph stored in Neo4j.

    Node and edge counts come from ``self.get_model_independent_graph_data()``.
    Edge density and the self-loop count are computed with plain Cypher, and the
    connected-component metrics use ``gds.wcc.stream`` on a freshly projected
    in-memory graph named ``"myGraph"``.

    Args:
        include_optional: When true, the self-loop count is computed. Diameter,
            average shortest path length and average clustering are not
            implemented for Neo4j, so each logs a warning and is reported as -1.
            When false, all four optional metrics are reported as -1.

    Returns:
        A dict with the keys ``num_nodes``, ``num_edges``, ``mean_degree``
        (``None`` when there are no nodes), ``edge_density``,
        ``num_connected_components``, ``sizes_of_connected_components``,
        ``num_selfloops``, ``diameter``, ``avg_shortest_path_length`` and
        ``avg_clustering``.
    """
    nodes, edges = await self.get_model_independent_graph_data()
    graph_name = "myGraph"

    # Rebuild the projection once so both WCC queries below see the current data.
    # Nothing in this method writes to the database afterwards, so re-projecting
    # before each query would only repeat the same work.
    await self.drop_graph(graph_name)
    await self.project_entire_graph(graph_name)

    async def _get_edge_density():
        query = """
        MATCH (n)
        WITH count(n) AS num_nodes
        MATCH ()-[r]->()
        WITH num_nodes, count(r) AS num_edges
        RETURN CASE
            WHEN num_nodes < 2 THEN 0
            ELSE num_edges * 1.0 / (num_nodes * (num_nodes - 1))
        END AS edge_density;
        """
        result = await self.query(query)
        return result[0]["edge_density"] if result else 0

    async def _get_num_connected_components():
        query = f"""
        CALL gds.wcc.stream('{graph_name}')
        YIELD componentId
        RETURN count(DISTINCT componentId) AS num_connected_components;
        """
        result = await self.query(query)
        return result[0]["num_connected_components"] if result else 0

    async def _get_size_of_connected_components():
        query = f"""
        CALL gds.wcc.stream('{graph_name}')
        YIELD componentId
        RETURN componentId, count(*) AS size
        ORDER BY size DESC;
        """
        result = await self.query(query)
        return [record["size"] for record in result] if result else []

    async def _count_self_loops():
        query = """
        MATCH (n)-[r]->(n)
        RETURN count(r) AS self_loop_count;
        """
        result = await self.query(query)
        return result[0]["self_loop_count"] if result else 0

    async def _get_diameter():
        logging.warning("Diameter calculation is not implemented for neo4j.")
        return -1

    async def _get_avg_shortest_path_length():
        logging.warning(
            "Average shortest path length calculation is not implemented for neo4j."
        )
        return -1

    async def _get_avg_clustering():
        logging.warning("Average clustering calculation is not implemented for neo4j.")
        return -1

    # Guard against an empty result list so a database without data yields zero
    # counts instead of an IndexError.
    num_nodes = len(nodes[0]["nodes"]) if nodes else 0
    num_edges = len(edges[0]["elements"]) if edges else 0

    mandatory_metrics = {
        "num_nodes": num_nodes,
        "num_edges": num_edges,
        "mean_degree": (2 * num_edges) / num_nodes if num_nodes != 0 else None,
        "edge_density": await _get_edge_density(),
        "num_connected_components": await _get_num_connected_components(),
        "sizes_of_connected_components": await _get_size_of_connected_components(),
    }

    if include_optional:
        optional_metrics = {
            "num_selfloops": await _count_self_loops(),
            "diameter": await _get_diameter(),
            "avg_shortest_path_length": await _get_avg_shortest_path_length(),
            "avg_clustering": await _get_avg_clustering(),
        }
    else:
        optional_metrics = {
            "num_selfloops": -1,
            "diameter": -1,
            "avg_shortest_path_length": -1,
            "avg_clustering": -1,
        }

    return mandatory_metrics | optional_metrics
4 changes: 2 additions & 2 deletions cognee/modules/data/models/GraphMetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@ class GraphMetrics(Base):
avg_shortest_path_length = Column(Float, nullable=True)
avg_clustering = Column(Float, nullable=True)

created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
created_at = Column(DateTime(timezone=True), server_default=func.now())
updated_at = Column(DateTime(timezone=True), onupdate=func.now())
Loading