Skip to content

Commit

Permalink
Merge pull request #58 from Bears-R-Us/node_property_storage_and_quer…
Browse files Browse the repository at this point in the history
…ying

Node property storage and querying
  • Loading branch information
zhihuidu authored Oct 18, 2023
2 parents 1e8396f + cce8799 commit 53366f4
Show file tree
Hide file tree
Showing 6 changed files with 822 additions and 95 deletions.
43 changes: 29 additions & 14 deletions arachne/arachne_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"""
import argparse
import statistics as st
import time
import arkouda as ak
import arachne as ar
Expand All @@ -33,6 +34,7 @@ def create_parser():
parser.add_argument("m", type=int, default=1000000, help="Number of edges for graph")
parser.add_argument("x", type=int, default=10, help="Number of labels for graph")
parser.add_argument("y", type=int, default=10, help="Number of relationships for graph")
parser.add_argument("s", type=int, default=2, help="Random seed to persist results.")

return parser

Expand All @@ -53,8 +55,8 @@ def create_parser():

### Build graph from randomly generated source and destination arrays.
# 1. Use Arkouda's randint to generate the random edge arrays.
src = ak.randint(0, args.n, args.m)
dst = ak.randint(0, args.n, args.m)
src = ak.randint(0, args.n, args.m, seed=args.s*2)
dst = ak.randint(0, args.n, args.m, seed=args.s*4)

# 2. Build property graph from randomly generated edges.
print()
Expand All @@ -75,11 +77,11 @@ def create_parser():

# 2. Generate random array of vertices with original vertex values.
vertices = prop_graph.nodes()
vertices_with_labels = ak.randint(0, len(prop_graph), len(prop_graph), seed=512)
vertices_with_labels = ak.randint(0, len(prop_graph), len(prop_graph), seed=args.s*8)
vertices_with_labels = vertices[vertices_with_labels]

# 3. Generate random array of labels of the same size as the random array of vertices above.
random_labels = ak.randint(0, len(labels), len(vertices_with_labels), seed=256)
random_labels = ak.randint(0, len(labels), len(vertices_with_labels), seed=args.s*16)
random_labels = labels[random_labels]

# 4. Pack the values into a dataframe and populate them into the graph.
Expand All @@ -99,12 +101,12 @@ def create_parser():
relationships = ak.array(relationships_list)

# 2. Generate random array of edges with original vertex values.
edges_with_relationships = ak.randint(0, prop_graph.size(), prop_graph.size(), seed=128)
edges_with_relationships = ak.randint(0, prop_graph.size(), prop_graph.size(), seed=args.s*32)
src_vertices_with_relationships = src[edges_with_relationships]
dst_vertices_with_relationships = dst[edges_with_relationships]

# 3. Generate random array of relationships of the same size as the random array of edges above.
random_relationships = ak.randint(0, len(relationships), len(edges_with_relationships), seed=64)
random_relationships = ak.randint(0, len(relationships), len(edges_with_relationships), seed=args.s*64)
random_relationships = relationships[random_relationships]

# 4. Pack the values into a dataframe and populate them into the graph.
Expand Down Expand Up @@ -160,21 +162,34 @@ def create_parser():
print()
print("### BUILD NEW DIGRAPH FROM ONE_PATH EDGE RESULTS AND RUN BREADTH-FIRST SEARCH ON IT.")
# 1. Build the graph first.
graph = ar.DiGraph()
graph = ar.Graph()
start = time.time()
graph.add_edges_from(queried_edges[0], queried_edges[1])
end = time.time()
build_time = end - start
print(f"Building graph with {len(graph)} vertices and {graph.size()} edges took "
f"{round(build_time,2)} seconds.")

# 2. Run breadth-first search on the graph from highest out degree node.
highest_out_degree = ak.argmax(graph.out_degree())
start = time.time()
depths = ar.bfs_layers(graph, int(graph.nodes()[highest_out_degree]))
end = time.time()
bfs_time = round(end-start,2)
print(f"Running breadth-first search on directed graph took {bfs_time} seconds.")
# 2. Run breadth-first search on the graph from the five highest degree nodes.
# `ak.argsort` returns the permutation that sorts ascending, so the largest degrees are the
# LAST five entries of that permutation, not the first five.
five_highest_out_degrees = ak.argsort(graph.degree())[-5:]
node_map = graph.nodes()
bfs_times = []
for internal_node_index in five_highest_out_degrees.to_list():
    u = node_map[internal_node_index]
    start = time.time()
    depths = ar.bfs_layers(graph, int(u))
    end = time.time()
    bfs_time = round(end-start,2)
    bfs_times.append(bfs_time)
    # A leading -1 in the value counts is read as "not reached" and subtracted from the total
    # (presumably bfs_layers marks unvisited vertices with -1 -- confirm against arachne).
    value_count = ak.value_counts(depths)
    if value_count[0][0] == -1:
        reachable_nodes = len(depths) - value_count[1][0]
    else:
        reachable_nodes = len(depths)
    print(f"Running breadth-first search from {u} took {bfs_time} seconds and reaches "
          f"{reachable_nodes} nodes.")
# The average is only reported once, so compute it after all runs have finished.
avg_runtime = round(st.mean(bfs_times),2)
print(f"Running breadth-first search took on average {avg_runtime} seconds")

# 3. Use depth to return one of the vertices with highest depth.
print(f"One of the vertices with highest depth was: {graph.nodes()[ak.argmax(depths)]}.")
Expand Down
148 changes: 126 additions & 22 deletions arachne/client/arachne/arachne.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ def add_node_labels(self, labels:ak.DataFrame) -> None:
Parameters
----------
labels
labels : ak.DataFrame
`ak.DataFrame({"vertex_ids" : vertices, "vertex_labels" : labels})`
Returns
Expand All @@ -577,54 +577,99 @@ def add_node_labels(self, labels:ak.DataFrame) -> None:

### Preprocessing steps for faster back-end array population.
# 0. Extract the vertex ids and vertex labels from the dataframe.
vertex_ids = labels["vertex_ids"]
vertex_labels = labels["vertex_labels"]
columns = labels.columns
vertex_ids = labels[columns[0]]
vertex_labels = labels[columns[1]]

# 1. Broadcast string label names to int values and extract the label str to int id map.
start = time.time()
gb_labels = ak.GroupBy(vertex_labels)
new_label_ids = ak.arange(gb_labels.unique_keys.size)
vertex_labels = gb_labels.broadcast(new_label_ids)
label_mapper = gb_labels.unique_keys
end = time.time()
label_id_time = round(end-start,2)

# 2. Convert the vertex_ids to internal vertex_ids.
start = time.time()
vertex_map = self.nodes()
inds = ak.in1d(vertex_ids, vertex_map)
vertex_ids = vertex_ids[inds]
vertex_labels = vertex_labels[inds]
vertex_ids = ak.find(vertex_ids, vertex_map)
end = time.time()
internal_id_time = round(end-start,2)

# 3. GroupBy of the vertex ids and labels.
start = time.time()
gb_vertex_ids_and_labels = ak.GroupBy([vertex_ids,vertex_labels])
vertex_ids = gb_vertex_ids_and_labels.unique_keys[0]
vertex_labels = gb_vertex_ids_and_labels.unique_keys[1]
end = time.time()
dedup_and_sort_time = round(end-start,2)

arrays = vertex_ids.name + " " + vertex_labels.name + " " + label_mapper.name
start = time.time()
args = { "GraphName" : self.name,
"Arrays" : arrays,
}
rep_msg = generic_msg(cmd=cmd, args=args)
end = time.time()
add_into_data_structure_time = round(end-start,2)

return (label_id_time, internal_id_time, dedup_and_sort_time, add_into_data_structure_time)
def add_node_properties(self, node_properties:ak.DataFrame) -> None:
    """Populates the graph object with properties derived from the columns of a dataframe. Node
    properties are different from node labels where labels are always strings and can be
    considered an extra identifier for different types of nodes. On the other hand, properties
    are key-value pairs more akin to storing the columns of a dataframe.

    Parameters
    ----------
    node_properties : ak.DataFrame
        `ak.DataFrame({"vertex_ids" : vertex_ids,
                       "property1" : property1, ..., "propertyN" : propertyN})`
        The first column is always read as the vertex ids, whatever its name is; every
        remaining column is sent to the back-end as one property. Rows whose vertex id is not
        present in `self.nodes()` are dropped before anything is sent.

    See Also
    --------
    add_node_labels, add_edge_relationships, add_edge_properties

    Returns
    -------
    None
    """
    cmd = "addNodeProperties"

    ### Preprocessing steps for faster back-end array population.
    # 0. Extract the column names of the dataframe. The property columns are taken with a slice
    #    rather than `columns.remove(...)` so the column list belonging to the caller's
    #    dataframe is never modified in place.
    columns = node_properties.columns
    id_column = columns[0]
    property_columns = columns[1:]

    # 1. Keep only the rows whose vertex exists in the graph, then convert the remaining
    #    vertex_ids to internal vertex_ids.
    vertex_map = self.nodes()
    inds = ak.in1d(node_properties[id_column], vertex_map)
    node_properties = node_properties[inds]
    vertex_ids = ak.find(node_properties[id_column], vertex_map)

    # 2. The property names exclude the first column, vertex ids, since those are sent
    #    separately.
    vertex_properties = ak.array(property_columns)

    # 3. Extract symbol table names of arrays to use in the back-end.
    data_array_names = ak.array([node_properties[column].name for column in property_columns])

    # 4. Apply the GroupBy permutation of the property names to both arrays so that each name
    #    stays aligned with the symbol table name of its data array.
    perm = ak.GroupBy(vertex_properties).permutation
    vertex_properties = vertex_properties[perm]
    data_array_names = data_array_names[perm]

    args = { "GraphName" : self.name,
             "VertexIdsName" : vertex_ids.name,
             "PropertyMapperName" : vertex_properties.name,
             "DataArrayNames" : data_array_names.name
           }
    generic_msg(cmd=cmd, args=args)

def add_edge_relationships(self, relationships:ak.DataFrame) -> None:
"""Populates the graph object with edge relationships from a dataframe. Passed dataframe
should follow the same format specified in the Parameters section below.
Parameters
----------
relationships
relationships : ak.DataFrame
`ak.DataFrame({"src" : src, "dst" : dst, "edge_relationships" : edge_relationships})`
Returns
Expand All @@ -633,11 +678,12 @@ def add_edge_relationships(self, relationships:ak.DataFrame) -> None:
"""
cmd = "addEdgeRelationships"

### Preprocessing steps for faster back-end array population.
### Preprocessing steps for faster back-end array population.
# 0. Extract the source and destination vertex ids and the relationships from the dataframe.
src_vertex_ids = relationships["src"]
dst_vertex_ids = relationships["dst"]
edge_relationships = relationships["edge_relationships"]
columns = relationships.columns
src_vertex_ids = relationships[columns[0]]
dst_vertex_ids = relationships[columns[1]]
edge_relationships = relationships[columns[2]]

# 1. Broadcast string relationship names to int values and extract the relationship str to
# int id map.
Expand Down Expand Up @@ -690,6 +736,24 @@ def get_node_labels(self) -> ak.Strings:

return ak.Strings.from_return_msg(rep_msg)

def get_node_properties(self) -> ak.Strings:
    """Returns the node properties stored for the property graph.

    Parameters
    ----------
    None

    Returns
    -------
    Strings
        The object built from the server's reply to the `getNodeProperties` command
        (presumably the names of the properties added to the nodes -- confirm against the
        server implementation).
    """
    reply = generic_msg(cmd="getNodeProperties", args={ "GraphName" : self.name })
    return ak.Strings.from_return_msg(reply)

def get_edge_relationships(self) -> ak.Strings:
"""Returns the sorted object of edge relationships stored for the property graph.
Expand Down Expand Up @@ -726,7 +790,7 @@ def query_labels( self,
Returns
-------
pdarray
Vertex names that contain the specified nodes.
Vertex names that contain the nodes that match the query.
"""
cmd = "queryLabels"

Expand All @@ -748,6 +812,46 @@ def query_labels( self,

return final_vertices

def query_node_properties(  self,
                            column:str, value,
                            op:str = "<" ) -> pdarray:
    """Given a property name, value, and operator, performs a query and returns the nodes that
    match the query. Adhere to the operators accepted and ensure the values passed match the
    same type of the property.

    Parameters
    ----------
    column : str
        String specifying the column being searched within.
    value
        Value forwarded to the server together with `op`; it should have the same type as the
        property stored in `column`.
    op : str
        Operator to apply to the search. Candidates vary and are listed below:
        `int64`, `uint64`, `float64`: "<", ">", "<=", ">=", "==", "<>".
        `bool`: "==", "<>".
        `str`: "contains".

    Returns
    -------
    pdarray
        Vertex names that contain the nodes that match the query.
    """
    query = { "GraphName" : self.name,
              "Column" : column,
              "Value" : value,
              "Op" : op }
    reply = generic_msg(cmd="queryNodeProperties", args=query)

    # The reply is turned into a pdarray that is used as a Boolean mask over the original
    # vertex names, giving back the external representation of the matching nodes.
    matches = create_pdarray(reply)
    return self.nodes()[matches]

def query_relationships( self,
relationships_to_find:pdarray,
op:str = "and" ) -> (pdarray,pdarray):
Expand Down
41 changes: 29 additions & 12 deletions arachne/server/GraphArray.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,22 @@ module GraphArray {

// Component key names to be stored in the components map for future retrieval
enum Component {
SRC, // The source array of every edge in the graph
DST, // The destination array of every edge in the graph
SEGMENTS, // The segments of adjacency lists for each vertex in DST
RANGES, // Keeps the range of the vertices the edge list stores per locale
EDGE_WEIGHT, // Stores the edge weights of the graph, if applicable
NODE_MAP, // Doing an index of NODE_MAP[u] gives you the original value of u
VERTEX_LABELS, // Any labels that belong to a specific node
VERTEX_LABELS_MAP, // Sorted array of vertex labels to integer id (array index)
EDGE_RELATIONSHIPS, // The relationships that belong to specific edges
EDGE_RELATIONSHIPS_MAP, //Sorted array of edge relationships to integer id (array index)
VERTEX_PROPS, // Any properties that belong to a specific node
EDGE_PROPS, // Any properties that belong to a specific edge
SRC, // The source array of every edge in the graph
DST, // The destination array of every edge in the graph
SEGMENTS, // The segments of adjacency lists for each vertex in DST
RANGES, // Keeps the range of the vertices the edge list stores per locale
EDGE_WEIGHT, // Stores the edge weights of the graph, if applicable
NODE_MAP, // Doing an index of NODE_MAP[u] gives you the original value of u
VERTEX_LABELS, // Any labels that belong to a specific node
VERTEX_LABELS_MAP, // Sorted array of vertex labels to integer id (array index)
EDGE_RELATIONSHIPS, // The relationships that belong to specific edges
EDGE_RELATIONSHIPS_MAP, // Sorted array of edge relationships to integer id (array index)
VERTEX_PROPS, // Any properties that belong to a specific node
VERTEX_PROPS_COL_MAP, // Sorted array of vertex property to integer id (array index)
VERTEX_PROPS_DTYPE_MAP, // Sorted array of column datatype to integer id (array index)
VERTEX_PROPS_COL2DTYPE, // Map of column names to the datatype of the column.
EDGE_PROPS, // Any properties that belong to a specific edge
EDGE_PROPS_MAP, // Sorted array of edge property to integer id (array index)
}

/**
Expand Down Expand Up @@ -104,6 +108,19 @@ module GraphArray {
}
}

// GenSymEntry subclass that holds a map from string keys to string values.
class MapSymEntry : GenSymEntry {
// The map held by this entry.
var stored_map: map(string, string);

// Initializes the parent GenSymEntry with the `string` type and assigns `map_to_store` to
// `stored_map`.
proc init(ref map_to_store: map(string, string)) {
super.init(string);
this.stored_map = map_to_store;
}
}

// Casts `e` to `borrowed MapSymEntry` and returns it. The cast is wrapped in `try!`, so an
// error thrown by the cast halts the program instead of propagating to the caller.
proc toMapSymEntry(e) {
return try! e : borrowed MapSymEntry;
}

// Casts `e` to `borrowed SymEntryAD()` and returns it. The cast is wrapped in `try!`, so an
// error thrown by the cast halts the program instead of propagating to the caller.
proc toSymEntryAD(e) {
return try! e : borrowed SymEntryAD();
}
Expand Down
Loading

0 comments on commit 53366f4

Please sign in to comment.