Merge branch 'develop'

BBVA · Nov 21, 2024 · 008c46c · 008c46c
2 parents 86274e5 + 2f00ef6
commit 008c46c
Show file tree

Hide file tree

Showing 21 changed files with 240 additions and 151 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,6 @@
 .vscode
+.coverage
+site
 **/__pycache__/
 **/checkpoint/
 **/tmp_checkpoint/
diff --git a/docs/index.md b/docs/index.md
@@ -0,0 +1,33 @@
+# mercury-graph
+
+**`mercury-graph`** is a Python library that offers **graph analytics capabilities with a technology-agnostic API**, allowing users to use a curated range of performant and scalable algorithms and utilities regardless of the technologies employed (pure Python, [Numba](https://numba.pydata.org/)-compiled, [**networkx**](https://networkx.org/), distributed Spark [**graphframes**](https://graphframes.github.io/graphframes/docs/_site/index.html), etc.).
+
+Currently implemented **submodules** in `mercury.graph` include:
+
+- [**`mercury.graph.core`**](reference/core.md), with the main classes of the library that create and store the graphs' data and properties.
+
+- [**`mercury.graph.ml`**](reference/ml.md), with graph theory and machine learning algorithms such as Louvain community detection, spectral clustering, Markov chains, spreading activation-based diffusion models and graph random walkers.
+
+- [**`mercury.graph.embeddings`**](reference/embeddings.md), with classes that calculate graph embeddings in different ways, such as following the Node2Vec algorithm.
+
+- [**`mercury.graph.viz`**](reference/viz.md), with capabilities for graph visualization.
+
+
+## Python installation
+
+The easiest way to install `mercury-graph` is using `pip`:
+
+```bash
+    pip install mercury-graph
+```
+
+### Repository
+
+The website for the GitHub repository can be found [here](https://github.com/BBVA/mercury-graph).
+
+## Help and support
+
+`mercury-graph` is a part of [**`mercury`**](https://www.bbvaaifactory.com/mercury/), a collaborative library developed by the **Advanced Analytics community at BBVA** that offers a broad range of tools to simplify and accelerate data science workflows. `mercury` was originally an Inner Source project, but some components, like `mercury.graph`, have been released as Open Source.
+
+  * [Mercury team](mailto:[email protected]?subject=[mercury-graph])
+  * [Issues](https://github.com/BBVA/mercury-graph/issues)
diff --git a/docs/reference/core.md b/docs/reference/core.md
@@ -0,0 +1,3 @@
+::: mercury.graph.core.Graph
+
+::: mercury.graph.core.SparkInterface
diff --git a/docs/reference/embeddings.md b/docs/reference/embeddings.md
@@ -0,0 +1,5 @@
+::: mercury.graph.embeddings.Embeddings
+
+::: mercury.graph.embeddings.GraphEmbedding
+
+::: mercury.graph.embeddings.SparkNode2Vec
diff --git a/docs/reference/ml.md b/docs/reference/ml.md
@@ -0,0 +1,9 @@
+::: mercury.graph.ml.LouvainCommunities
+
+::: mercury.graph.ml.SparkRandomWalker
+
+::: mercury.graph.ml.SparkSpreadingActivation
+
+::: mercury.graph.ml.SpectralClustering
+
+::: mercury.graph.ml.Transition
diff --git a/docs/reference/viz.md b/docs/reference/viz.md
@@ -0,0 +1 @@
+::: mercury.graph.viz.Moebius
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
@@ -0,0 +1,11 @@
+[data-md-color-scheme="default"] {
+    --md-primary-fg-color:               #1973b8;
+    --md-primary-fg-color--light:        #1973b8;
+    --md-primary-fg-color--dark:         #1973b8;
+}
+
+[data-md-color-scheme="slate"] {
+    --md-primary-fg-color:               #1973b8;
+    --md-primary-fg-color--light:        #1973b8;
+    --md-primary-fg-color--dark:         #1973b8;
+}
diff --git a/mercury/graph/core/graph.py b/mercury/graph/core/graph.py
@@ -133,25 +133,30 @@ class Graph:
 
     The conventions can be found here:
 
-    - Scikit API: https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects
-    - On scikit conventions: https://scikit-learn.org/stable/glossary.html
+    - [Scikit API](https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects)
+    - [On scikit conventions](https://scikit-learn.org/stable/glossary.html)
 
     Args:
-        data: The data to create the graph from. It can be a pandas DataFrame, a networkx Graph, a pyspark DataFrame, or a Graphframe. In
-            case it already contains a graph (networkx or graphframes), the keys and nodes arguments are ignored.
-        keys: A dictionary with keys to specify the columns in the data DataFrame. The keys are:
+        data (pd.DataFrame, nx.Graph or pyspark.sql.DataFrame): The data to create the graph from. 
+            It can be a pandas DataFrame, a networkx Graph, a pyspark DataFrame, or a Graphframe. 
+            In case it already contains a graph (networkx or graphframes), the keys and nodes arguments are ignored.
+        keys (dict): A dictionary with keys to specify the columns in the data DataFrame. The keys are:
+
             - 'src': The name of the column with the source node.
             - 'dst': The name of the column with the destination node.
             - 'id': The name of the column with the node id.
             - 'weight': The name of the column with the edge weight.
             - 'directed': A boolean to specify if the graph is directed. (Only for pyspark DataFrames)
+
             When the keys argument is not provided or the key is missing, the default values are:
+
             - 'src': 'src'
             - 'dst': 'dst'
             - 'id': 'id'
             - 'weight': 'weight'
             - 'directed': True
-        nodes: A pandas DataFrame or a pyspark DataFrame with the nodes data. (Only when `data` is pandas or pyspark DataFrame and with the
+
+        nodes (pd.DataFrame or pyspark.sql.DataFrame): A pandas DataFrame or a pyspark DataFrame with the nodes data. (Only when `data` is pandas or pyspark DataFrame and with the
             same type as `data`) If not given, the nodes are inferred from the edges DataFrame.
     """
     def __init__(self, data = None, keys = None, nodes = None):
@@ -220,7 +225,7 @@ def nodes(self):
         Returns an iterator over all the nodes in the graph.
 
         Returns:
-            NodeIterator: An iterator that yields each node in the graph.
+            (NodeIterator): An iterator that yields each node in the graph.
         """
         return NodeIterator(self)
 
@@ -231,7 +236,7 @@ def edges(self):
         Returns an iterator over the edges in the graph.
 
         Returns:
-            EdgeIterator: An iterator object that allows iterating over the edges in the graph.
+            (EdgeIterator): An iterator object that allows iterating over the edges in the graph.
         """
         return EdgeIterator(self)
 
@@ -244,7 +249,7 @@ def networkx(self):
         If the graph has not been converted to NetworkX format yet, it will be converted and cached for future use.
 
         Returns:
-            networkx.Graph: The graph representation as a NetworkX graph.
+            (networkx.Graph): The graph representation as a NetworkX graph.
         """
         if self._as_networkx is None:
             self._as_networkx = self._to_networkx()
@@ -260,7 +265,7 @@ def graphframe(self):
         If the graph has not been converted to a GraphFrame yet, it will be converted and cached for future use.
 
         Returns:
-            GraphFrame: The graph represented as a GraphFrame.
+            (GraphFrame): The graph represented as a GraphFrame.
         """
         if self._as_graphframe is None:
             self._as_graphframe = self._to_graphframe()
@@ -276,7 +281,7 @@ def dgl(self):
         If the graph has not been converted to a DGL graph yet, it will be converted and cached for future use.
 
         Returns:
-            dgl.DGLGraph: The graph represented as a DGL graph.
+            (dgl.DGLGraph): The graph represented as a DGL graph.
         """
         if self._as_dgl is None:
             self._as_dgl = self._to_dgl()
@@ -380,7 +385,7 @@ def number_of_nodes(self):
         Returns the number of nodes in the graph.
 
         Returns:
-            int: The number of nodes in the graph.
+            (int): The number of nodes in the graph.
         """
         return self._number_of_nodes
 
@@ -391,7 +396,7 @@ def number_of_edges(self):
         Returns the number of edges in the graph.
 
         Returns:
-            int: The number of edges in the graph.
+            (int): The number of edges in the graph.
         """
         return self._number_of_edges
 
@@ -401,10 +406,11 @@ def is_directed(self):
         """
         Returns True if the graph is directed, False otherwise.
 
-        Note: Graphs created using graphframes are always directed. The way around it is to add the reverse edges to the graph.
-        This can be done by creating the Graph with pyspark DataFrame() and defining a key 'directed' set as False in the `dict`
-        argument. Otherwise, the graph will be considered directed even if these reversed edges have been created by other means
-        this class cannot be aware of.
+        Note: 
+            Graphs created using graphframes are always directed. The way around it is to add the reverse edges to the graph.
+            This can be done by creating the Graph with pyspark DataFrame() and defining a key 'directed' set as False in the `dict`
+            argument. Otherwise, the graph will be considered directed even if these reversed edges have been created by other means
+            this class cannot be aware of.
         """
         return self._is_directed
 

diff --git a/mercury/graph/embeddings/embeddings.py b/mercury/graph/embeddings/embeddings.py
@@ -41,10 +41,11 @@ class Embeddings(BaseClass):
     Note:
         **On dimension:** Embeddings cannot be zero (that is against the whole concept). Smaller dimension embeddings can only hold
         few elements without introducing spurious correlations by some form of 'birthday attack' phenomenon as elements increase. Later
-        it is very hard to get rid of that spurious 'knowledge'. **Solution**: With may elements, you have to go to high enough dimension
-        even if the structure is simple. Pretending to fit many embeddings in low dimension without them being correlated is like
-        pretending to plot a trillion random points in a square centimeter while keeping them 1 mm apart from each other: It's simply
-        impossible!
+        it is very hard to get rid of that spurious 'knowledge'. 
+
+        **Solution**: With many elements, you have to go to high enough dimension even if the structure is simple. 
+        Pretending to fit many embeddings in low dimension without them being correlated is like pretending to plot a trillion random 
+        points in a square centimeter while keeping them 1 mm apart from each other: It's simply impossible!
     """
 
     def __init__(

diff --git a/mercury/graph/embeddings/graphembeddings.py b/mercury/graph/embeddings/graphembeddings.py
@@ -122,7 +122,7 @@ def __getitem__(self, arg):
             (numpy.matrix): A numpy matrix of one row
 
         """
-        return self.graph_embedding_.embeddings_matrix_[self.node_ids.index(arg)]
+        return self.embeddings_.embeddings_matrix_[self.node_ids.index(arg)]
 
     def fit(self, g: Graph):
         """
@@ -190,13 +190,13 @@ def fit(self, g: Graph):
             self.max_per_epoch if self.max_per_epoch is not None else self.n_jumps,
         )
 
-        self.graph_embedding_ = Embeddings(
+        self.embeddings_ = Embeddings(
             dimension=self.dimension,
             num_elements=len(self.node_ids),
             learn_step=self.learn_step,
             bidirectional=self.bidirectional,
         )
-        self.graph_embedding_.fit(converge, diverge)
+        self.embeddings_.fit(converge, diverge)
 
         return self
 
@@ -205,12 +205,12 @@ def embedding(self):
         Return the internal Embeddings object.
 
         Returns:
-            The embedding which is a dense matrix of `float` that can be used with `numpy` functions.
+            (mercury.graph.embeddings.Embeddings): The embedding which is a dense matrix of `float` that can be used with `numpy` functions.
         """
-        if not hasattr(self, "graph_embedding_"):
+        if not hasattr(self, "embeddings_"):
             return
 
-        return self.graph_embedding_
+        return self.embeddings_
 
     def get_most_similar_nodes(
         self, node_id, k=5, metric="cosine", return_as_indices=False
@@ -222,7 +222,7 @@ def get_most_similar_nodes(
             node_id (object): Id of the node that we want to search the similar nodes.
             k (int): Number of most similar nodes to return
             metric (str): metric to use as a similarity.
-            return_as_indices (bool): if return the nodes as indices(False), or as node ids (True)
+            return_as_indices (bool): Whether to return the nodes as indices (True) or as node ids (False).
 
         Returns:
             (list): list of k most similar nodes and list of similarities of the most similar nodes
@@ -231,7 +231,7 @@ def get_most_similar_nodes(
         node_index = self.node_ids.index(node_id)
 
         ordered_indices, ordered_similarities = (
-            self.graph_embedding_.get_most_similar_embeddings(node_index, k, metric)
+            self.embeddings_.get_most_similar_embeddings(node_index, k, metric)
         )
 
         if not return_as_indices:
@@ -255,7 +255,7 @@ def save(self, file_name, save_embedding=False):
         with bz2.BZ2File(file_name, "w") as f:
             pickle.dump(GraphEmbedding.FILE_HEAD, f)
             pickle.dump(save_embedding, f)
-            pickle.dump(self.graph_embedding_.dimension, f)
+            pickle.dump(self.embeddings_.dimension, f)
 
             pickle.dump(self.node_ids, f)
 
@@ -268,7 +268,7 @@ def save(self, file_name, save_embedding=False):
             pickle.dump(self.TotW, f)
 
             if save_embedding:
-                np.save(f, self.graph_embedding_.embeddings_matrix_)
+                np.save(f, self.embeddings_.embeddings_matrix_)
 
             pickle.dump(GraphEmbedding.FILE_END, f)
 
@@ -296,10 +296,10 @@ def _load(self, file_name):
 
             self.TotW = pickle.load(f)
 
-            self.graph_embedding_ = Embeddings(dimension, len(self.node_ids))
+            self.embeddings_ = Embeddings(dimension, len(self.node_ids))
 
             if has_emb:
-                self.graph_embedding_.embeddings_matrix_ = np.load(f)
+                self.embeddings_.embeddings_matrix_ = np.load(f)
 
             end = pickle.load(f)
 

diff --git a/mercury/graph/embeddings/spark_node2vec.py b/mercury/graph/embeddings/spark_node2vec.py
@@ -72,11 +72,11 @@ def __init__(
             w2v_step_size (float): This is the Spark Word2Vec parameter stepSize, the default value is the original default value.
             w2v_min_count (int): This is the Spark Word2Vec parameter minCount, the default value is the original default value (5). It is the
                 minimum number of times that a node has to appear to generate an embedding.
-            path_cache (str): folder where random walks will be stored, the default value is None which entails that random walks will not
+            path_cache (str): Folder where random walks will be stored, the default value is None which entails that random walks will not
                 be stored.
-            use_cached_rw (bool): flag that indicates if random walks should be read from disk (hence, they will not be computed again).
+            use_cached_rw (bool): Flag that indicates if random walks should be read from disk (hence, they will not be computed again).
                 Setting this parameter to True requires a valid path_cache.
-            n_partitions_cache (int): number of partitions that will be used when storing the random walks, to optimize read access.
+            n_partitions_cache (int): Number of partitions that will be used when storing the random walks, to optimize read access.
                 The default value is 10.
             load_file (str): (optional) The full path to a parquet file containing a serialized SparkNode2Vec object. This file must be created
                 using SparkNode2Vec.save().
@@ -117,15 +117,14 @@ def fit(self, G: Graph):
         """
         Train the embedding by doing random walks.
 
+        Random walk paths are available in attribute `paths_`.
+
         Args:
-            G (mercury.graph Graph asset): A `mercury.graph` Graph object. The embedding will be created so that each row in the embedding maps
-            a node ID in G. (This parameter will be ignored when `load_file` is used.)
+            G (mercury.graph.core.Graph): A `mercury.graph` Graph object. The embedding will be created so that each row in the embedding maps
+                a node ID in G. (This parameter will be ignored when `load_file` is used.)
 
         Returns:
-            self (object): Fitted self (or raises an error)
-
-        Random walk paths are available in attribute `paths_`.
-        Spark's Word2Vec model fitted on paths_ is available in attribute `node2vec_` through method `model()`.
+            (self): Fitted self (or raises an error)
         """
 
         if self.path_cache is None:
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		::: mercury.graph.core.Graph

		::: mercury.graph.core.SparkInterface