diff --git a/.gitignore b/.gitignore
index 908d653..73733b2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 .vscode
+.coverage
+site
 **/__pycache__/
 **/checkpoint/
 **/tmp_checkpoint/
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..3067aab
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,33 @@
+# mercury-graph
+
+**`mercury-graph`** is a Python library that offers **graph analytics capabilities with a technology-agnostic API**, giving users access to a curated range of performant and scalable algorithms and utilities regardless of the technologies employed (pure Python, [Numba](https://numba.pydata.org/)-compiled, [**networkx**](https://networkx.org/), distributed Spark [**graphframes**](https://graphframes.github.io/graphframes/docs/_site/index.html), etc.).
+
+Currently implemented **submodules** in `mercury.graph` include:
+
+- [**`mercury.graph.core`**](reference/core.md), with the main classes of the library that create and store the graphs' data and properties.
+
+- [**`mercury.graph.ml`**](reference/ml.md), with graph theory and machine learning algorithms such as Louvain community detection, spectral clustering, Markov chains, spreading activation-based diffusion models and graph random walkers.
+
+- [**`mercury.graph.embeddings`**](reference/embeddings.md), with classes that calculate graph embeddings in different ways, such as following the Node2Vec algorithm.
+
+- [**`mercury.graph.viz`**](reference/viz.md), with capabilities for graph visualization.
+
+
+## Python installation
+
+The easiest way to install `mercury-graph` is using `pip`:
+
+```bash
+pip install mercury-graph
+```
+
+### Repository
+
+The GitHub repository can be found [here](https://github.com/BBVA/mercury-graph).
+
+## Help and support
+
+`mercury-graph` is part of [**`mercury`**](https://www.bbvaaifactory.com/mercury/), a collaborative library developed by the **Advanced Analytics community at BBVA** that offers a broad range of tools to simplify and accelerate data science workflows. This library was originally an Inner Source project, but some components, like `mercury.graph`, have been released as Open Source.
+
+ * [Mercury team](mailto:mercury.group@bbva.com?subject=[mercury-graph])
+ * [Issues](https://github.com/BBVA/mercury-graph/issues)
\ No newline at end of file
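The `Graph` class documented later in this patch accepts an edge DataFrame plus optional `keys` and `nodes` arguments. A minimal, hypothetical sketch of what that API looks like in practice (the data is illustrative; column names follow the documented defaults):

```python
import pandas as pd
from mercury.graph.core import Graph

# Edge list using the documented default column names 'src', 'dst', 'weight'.
edges = pd.DataFrame({
    "src":    ["a", "a", "b"],
    "dst":    ["b", "c", "c"],
    "weight": [1.0, 0.5, 2.0],
})

g = Graph(data=edges)   # keys/nodes are optional; nodes are inferred from edges
print(g.number_of_nodes, g.number_of_edges)
```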
diff --git a/docs/reference/core.md b/docs/reference/core.md
new file mode 100644
index 0000000..96ec816
--- /dev/null
+++ b/docs/reference/core.md
@@ -0,0 +1,3 @@
+::: mercury.graph.core.Graph
+
+::: mercury.graph.core.SparkInterface
\ No newline at end of file
diff --git a/docs/reference/embeddings.md b/docs/reference/embeddings.md
new file mode 100644
index 0000000..ce5d338
--- /dev/null
+++ b/docs/reference/embeddings.md
@@ -0,0 +1,5 @@
+::: mercury.graph.embeddings.Embeddings
+
+::: mercury.graph.embeddings.GraphEmbedding
+
+::: mercury.graph.embeddings.SparkNode2Vec
\ No newline at end of file
diff --git a/docs/reference/ml.md b/docs/reference/ml.md
new file mode 100644
index 0000000..33a7ff0
--- /dev/null
+++ b/docs/reference/ml.md
@@ -0,0 +1,9 @@
+::: mercury.graph.ml.LouvainCommunities
+
+::: mercury.graph.ml.SparkRandomWalker
+
+::: mercury.graph.ml.SparkSpreadingActivation
+
+::: mercury.graph.ml.SpectralClustering
+
+::: mercury.graph.ml.Transition
\ No newline at end of file
diff --git a/docs/reference/viz.md b/docs/reference/viz.md
new file mode 100644
index 0000000..9171857
--- /dev/null
+++ b/docs/reference/viz.md
@@ -0,0 +1 @@
+::: mercury.graph.viz.Moebius
\ No newline at end of file
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
new file mode 100644
index 0000000..019f67a
--- /dev/null
+++ b/docs/stylesheets/extra.css
@@ -0,0 +1,11 @@
+[data-md-color-scheme="default"] {
+  --md-primary-fg-color: #1973b8;
+  --md-primary-fg-color--light: #1973b8;
+  --md-primary-fg-color--dark: #1973b8;
+}
+
+[data-md-color-scheme="slate"] {
+  --md-primary-fg-color: #1973b8;
+  --md-primary-fg-color--light: #1973b8;
+  --md-primary-fg-color--dark: #1973b8;
+}
\ No newline at end of file
diff --git a/mercury/graph/core/graph.py b/mercury/graph/core/graph.py
index 8064445..280e853 100644
--- a/mercury/graph/core/graph.py
+++ b/mercury/graph/core/graph.py
@@ -133,25 +133,30 @@ class Graph:
     The conventions can be found here:
 
-    - Scikit API: https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects
-    - On scikit conventions: https://scikit-learn.org/stable/glossary.html
+    - [Scikit API](https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects)
+    - [On scikit conventions](https://scikit-learn.org/stable/glossary.html)
 
     Args:
-        data: The data to create the graph from. It can be a pandas DataFrame, a networkx Graph, a pyspark DataFrame, or a Graphframe. In
-            case it already contains a graph (networkx or graphframes), the keys and nodes arguments are ignored.
-        keys: A dictionary with keys to specify the columns in the data DataFrame. The keys are:
+        data (pd.DataFrame, nx.Graph or pyspark.sql.DataFrame): The data to create the graph from.
+            It can be a pandas DataFrame, a networkx Graph, a pyspark DataFrame, or a Graphframe.
+            If it already contains a graph (networkx or graphframes), the keys and nodes arguments are ignored.
+        keys (dict): A dictionary with keys to specify the columns in the data DataFrame. The keys are:
+
             - 'src': The name of the column with the source node.
             - 'dst': The name of the column with the destination node.
             - 'id': The name of the column with the node id.
            - 'weight': The name of the column with the edge weight.
            - 'directed': A boolean to specify if the graph is directed. (Only for pyspark DataFrames)
+            When the keys argument is not provided or the key is missing, the default values are:
+
             - 'src': 'src'
             - 'dst': 'dst'
             - 'id': 'id'
             - 'weight': 'weight'
             - 'directed': True
-        nodes: A pandas DataFrame or a pyspark DataFrame with the nodes data. (Only when `data` is pandas or pyspark DataFrame and with the
+
+        nodes (pd.DataFrame or pyspark.sql.DataFrame): A pandas DataFrame or a pyspark DataFrame with the nodes data. (Only when `data` is pandas or pyspark DataFrame and with the
             same type as `data`) If not given, the nodes are inferred from the edges DataFrame.
     """
 
     def __init__(self, data = None, keys = None, nodes = None):
@@ -220,7 +225,7 @@ def nodes(self):
         Returns an iterator over all the nodes in the graph.
 
         Returns:
-            NodeIterator: An iterator that yields each node in the graph.
+            (NodeIterator): An iterator that yields each node in the graph.
         """
 
         return NodeIterator(self)
@@ -231,7 +236,7 @@ def edges(self):
         Returns an iterator over the edges in the graph.
 
         Returns:
-            EdgeIterator: An iterator object that allows iterating over the edges in the graph.
+            (EdgeIterator): An iterator object that allows iterating over the edges in the graph.
         """
 
         return EdgeIterator(self)
@@ -244,7 +249,7 @@ def networkx(self):
         If the graph has not been converted to NetworkX format yet, it will be converted and cached for future use.
 
         Returns:
-            networkx.Graph: The graph representation as a NetworkX graph.
+            (networkx.Graph): The graph representation as a NetworkX graph.
         """
 
         if self._as_networkx is None:
             self._as_networkx = self._to_networkx()
@@ -260,7 +265,7 @@ def graphframe(self):
         If the graph has not been converted to a GraphFrame yet, it will be converted and cached for future use.
 
         Returns:
-            GraphFrame: The graph represented as a GraphFrame.
+            (GraphFrame): The graph represented as a GraphFrame.
         """
 
         if self._as_graphframe is None:
             self._as_graphframe = self._to_graphframe()
@@ -276,7 +281,7 @@ def dgl(self):
         If the graph has not been converted to a DGL graph yet, it will be converted and cached for future use.
 
         Returns:
-            dgl.DGLGraph: The graph represented as a DGL graph.
+            (dgl.DGLGraph): The graph represented as a DGL graph.
         """
 
         if self._as_dgl is None:
             self._as_dgl = self._to_dgl()
@@ -380,7 +385,7 @@ def number_of_nodes(self):
         Returns the number of nodes in the graph.
 
         Returns:
-            int: The number of nodes in the graph.
+            (int): The number of nodes in the graph.
         """
 
         return self._number_of_nodes
@@ -391,7 +396,7 @@ def number_of_edges(self):
         Returns the number of edges in the graph.
 
         Returns:
-            int: The number of edges in the graph.
+            (int): The number of edges in the graph.
         """
 
         return self._number_of_edges
@@ -401,10 +406,11 @@ def is_directed(self):
         """
         Returns True if the graph is directed, False otherwise.
 
-        Note: Graphs created using graphframes are always directed. The way around it is to add the reverse edges to the graph.
-            This can be done by creating the Graph with pyspark DataFrame() and defining a key 'directed' set as False in the `dict`
-            argument. Otherwise, the graph will be considered directed even if these reversed edges have been created by other means
-            this class cannot be aware of.
+        Note:
+            Graphs created using graphframes are always directed. The way around it is to add the reverse edges to the graph.
+            This can be done by creating the Graph from a pyspark DataFrame and setting the key 'directed' to False in the `keys`
+            argument. Otherwise, the graph will be considered directed even if these reversed edges have been created by other means
+            that this class cannot be aware of.
         """
 
         return self._is_directed
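A short sketch of the lazy conversion behavior that the `networkx`, `graphframe` and `dgl` property docstrings above describe, continuing the hypothetical `g` from the earlier sketch. The first access converts and caches; later accesses reuse the cached object:

```python
nx_g = g.networkx           # first access: converts to networkx.Graph and caches it
assert g.networkx is nx_g   # second access: returns the same cached object

# The same pattern applies to g.graphframe (requires Spark + graphframes)
# and g.dgl (requires DGL); each conversion happens at most once.
```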
""" return self._is_directed diff --git a/mercury/graph/embeddings/embeddings.py b/mercury/graph/embeddings/embeddings.py index 55f797e..5da6458 100644 --- a/mercury/graph/embeddings/embeddings.py +++ b/mercury/graph/embeddings/embeddings.py @@ -41,10 +41,11 @@ class Embeddings(BaseClass): Note: **On dimension:** Embeddings cannot be zero (that is against the whole concept). Smaller dimension embeddings can only hold few elements without introducing spurious correlations by some form of 'birthday attack' phenomenon as elements increase. Later - it is very hard to get rid of that spurious 'knowledge'. **Solution**: With may elements, you have to go to high enough dimension - even if the structure is simple. Pretending to fit many embeddings in low dimension without them being correlated is like - pretending to plot a trillion random points in a square centimeter while keeping them 1 mm apart from each other: It's simply - impossible! + it is very hard to get rid of that spurious 'knowledge'. + + **Solution**: With may elements, you have to go to high enough dimension even if the structure is simple. + Pretending to fit many embeddings in low dimension without them being correlated is like pretending to plot a trillion random + points in a square centimeter while keeping them 1 mm apart from each other: It's simply impossible! """ def __init__( diff --git a/mercury/graph/embeddings/graphembeddings.py b/mercury/graph/embeddings/graphembeddings.py index 82d0b66..ce2c2f6 100644 --- a/mercury/graph/embeddings/graphembeddings.py +++ b/mercury/graph/embeddings/graphembeddings.py @@ -122,7 +122,7 @@ def __getitem__(self, arg): (numpy.matrix): A numpy matrix of one row """ - return self.graph_embedding_.embeddings_matrix_[self.node_ids.index(arg)] + return self.embeddings_.embeddings_matrix_[self.node_ids.index(arg)] def fit(self, g: Graph): """ @@ -190,13 +190,13 @@ def fit(self, g: Graph): self.max_per_epoch if self.max_per_epoch is not None else self.n_jumps, ) - self.graph_embedding_ = Embeddings( + self.embeddings_ = Embeddings( dimension=self.dimension, num_elements=len(self.node_ids), learn_step=self.learn_step, bidirectional=self.bidirectional, ) - self.graph_embedding_.fit(converge, diverge) + self.embeddings_.fit(converge, diverge) return self @@ -205,12 +205,12 @@ def embedding(self): Return the internal Embeddings object. Returns: - The embedding which is a dense matrix of `float` that can be used with `numpy` functions. + (mercury.graph.embeddings.Embeddings): The embedding which is a dense matrix of `float` that can be used with `numpy` functions. """ - if not hasattr(self, "graph_embedding_"): + if not hasattr(self, "embeddings_"): return - return self.graph_embedding_ + return self.embeddings_ def get_most_similar_nodes( self, node_id, k=5, metric="cosine", return_as_indices=False @@ -222,7 +222,7 @@ def get_most_similar_nodes( node_id (object): Id of the node that we want to search the similar nodes. k (int): Number of most similar nodes to return metric (str): metric to use as a similarity. 
diff --git a/mercury/graph/embeddings/graphembeddings.py b/mercury/graph/embeddings/graphembeddings.py
index 82d0b66..ce2c2f6 100644
--- a/mercury/graph/embeddings/graphembeddings.py
+++ b/mercury/graph/embeddings/graphembeddings.py
@@ -122,7 +122,7 @@ def __getitem__(self, arg):
             (numpy.matrix): A numpy matrix of one row
         """
 
-        return self.graph_embedding_.embeddings_matrix_[self.node_ids.index(arg)]
+        return self.embeddings_.embeddings_matrix_[self.node_ids.index(arg)]
 
     def fit(self, g: Graph):
         """
@@ -190,13 +190,13 @@ def fit(self, g: Graph):
             self.max_per_epoch if self.max_per_epoch is not None else self.n_jumps,
         )
 
-        self.graph_embedding_ = Embeddings(
+        self.embeddings_ = Embeddings(
             dimension=self.dimension,
             num_elements=len(self.node_ids),
             learn_step=self.learn_step,
             bidirectional=self.bidirectional,
         )
-        self.graph_embedding_.fit(converge, diverge)
+        self.embeddings_.fit(converge, diverge)
 
         return self
 
@@ -205,12 +205,12 @@ def embedding(self):
         Return the internal Embeddings object.
 
         Returns:
-            The embedding which is a dense matrix of `float` that can be used with `numpy` functions.
+            (mercury.graph.embeddings.Embeddings): The embedding which is a dense matrix of `float` that can be used with `numpy` functions.
         """
 
-        if not hasattr(self, "graph_embedding_"):
+        if not hasattr(self, "embeddings_"):
             return
 
-        return self.graph_embedding_
+        return self.embeddings_
 
     def get_most_similar_nodes(
         self, node_id, k=5, metric="cosine", return_as_indices=False
@@ -222,7 +222,7 @@ def get_most_similar_nodes(
             node_id (object): Id of the node that we want to search the similar nodes.
             k (int): Number of most similar nodes to return
             metric (str): metric to use as a similarity.
-            return_as_indices (bool): if return the nodes as indices(False), or as node ids (True)
+            return_as_indices (bool): whether to return the most similar nodes as indices (True) or as node ids (False)
 
         Returns:
             (list): list of k most similar nodes and list of similarities of the most similar nodes
@@ -231,7 +231,7 @@ def get_most_similar_nodes(
         node_index = self.node_ids.index(node_id)
 
         ordered_indices, ordered_similarities = (
-            self.graph_embedding_.get_most_similar_embeddings(node_index, k, metric)
+            self.embeddings_.get_most_similar_embeddings(node_index, k, metric)
         )
 
         if not return_as_indices:
@@ -255,7 +255,7 @@ def save(self, file_name, save_embedding=False):
         with bz2.BZ2File(file_name, "w") as f:
             pickle.dump(GraphEmbedding.FILE_HEAD, f)
             pickle.dump(save_embedding, f)
-            pickle.dump(self.graph_embedding_.dimension, f)
+            pickle.dump(self.embeddings_.dimension, f)
 
             pickle.dump(self.node_ids, f)
 
@@ -268,7 +268,7 @@ def save(self, file_name, save_embedding=False):
             pickle.dump(self.TotW, f)
 
             if save_embedding:
-                np.save(f, self.graph_embedding_.embeddings_matrix_)
+                np.save(f, self.embeddings_.embeddings_matrix_)
 
             pickle.dump(GraphEmbedding.FILE_END, f)
 
@@ -296,10 +296,10 @@ def _load(self, file_name):
 
             self.TotW = pickle.load(f)
 
-            self.graph_embedding_ = Embeddings(dimension, len(self.node_ids))
+            self.embeddings_ = Embeddings(dimension, len(self.node_ids))
 
             if has_emb:
-                self.graph_embedding_.embeddings_matrix_ = np.load(f)
+                self.embeddings_.embeddings_matrix_ = np.load(f)
 
             end = pickle.load(f)
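A hypothetical end-to-end sketch of the `GraphEmbedding` API touched above (constructor arguments mirror those in the unit tests at the end of this patch; `g` and node id `"a"` are illustrative):

```python
from mercury.graph.embeddings import GraphEmbedding

ge = GraphEmbedding(dimension=2, n_jumps=5)
ge.fit(g)    # random walks + Embeddings fit; result stored in ge.embeddings_

row = ge["a"]                                  # one-row matrix, per __getitem__
nodes, sims = ge.get_most_similar_nodes("a", k=3, metric="cosine")
```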
diff --git a/mercury/graph/embeddings/spark_node2vec.py b/mercury/graph/embeddings/spark_node2vec.py
index e806318..95c2505 100644
--- a/mercury/graph/embeddings/spark_node2vec.py
+++ b/mercury/graph/embeddings/spark_node2vec.py
@@ -72,11 +72,11 @@ def __init__(
         w2v_step_size (float): This is the Spark Word2Vec parameter stepSize, the default value is the original default value.
         w2v_min_count (int): This is the Spark Word2Vec parameter minCount, the default value is the original default value (5).
             Is the minimum number of times that a node has to appear to generate an embedding.
-        path_cache (str): folder where random walks will be stored, the default value is None which entails that random walks will not
+        path_cache (str): Folder where random walks will be stored, the default value is None which entails that random walks will not
             be stored.
-        use_cached_rw (bool): flag that indicates if random walks should be read from disk (hence, they will not be computed again).
+        use_cached_rw (bool): Flag that indicates if random walks should be read from disk (hence, they will not be computed again).
             Setting this parameter to True requires a valid path_cache.
-        n_partitions_cache (int): number of partitions that will be used when storing the random walks, to optimize read access.
+        n_partitions_cache (int): Number of partitions that will be used when storing the random walks, to optimize read access.
             The default value is 10.
         load_file (str): (optional) The full path to a parquet file containing a serialized SparkNode2Vec object. This file must be
             created using SparkNode2Vec.save().
@@ -117,15 +117,14 @@ def fit(self, G: Graph):
         """
         Train the embedding by doing random walks.
 
+        Random walk paths are available in attribute `paths_`.
+
         Args:
-            G (mercury.graph Graph asset): A `mercury.graph` Graph object. The embedding will be created so that each row in the embedding maps
-                a node ID in G. (This parameter will be ignored when `load_file` is used.)
+            G (mercury.graph.core.Graph): A `mercury.graph` Graph object. The embedding will be created so that each row in the embedding maps
+                to a node ID in G. (This parameter will be ignored when `load_file` is used.)
 
         Returns:
-            self (object): Fitted self (or raises an error)
-
-            Random walk paths are available in attribute `paths_`.
-            Spark's Word2Vec model fitted on paths_ is available in attribute `node2vec_` through method `model()`.
+            (self): Fitted self (or raises an error)
         """
 
         if self.path_cache is None:
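A hedged sketch of the caching workflow that the `path_cache` / `use_cached_rw` / `n_partitions_cache` docstrings above describe (the path and the Spark-backed graph `g_spark` are illustrative):

```python
from mercury.graph.embeddings import SparkNode2Vec

# First run: compute random walks and persist them for later reuse.
n2v = SparkNode2Vec(path_cache="/tmp/n2v_walks", n_partitions_cache=10)
n2v.fit(g_spark)            # paths_ holds the random walk paths

# Later run: skip the walk computation by reading the cached walks.
n2v_again = SparkNode2Vec(path_cache="/tmp/n2v_walks", use_cached_rw=True)
n2v_again.fit(g_spark)
```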
diff --git a/mercury/graph/ml/louvain.py b/mercury/graph/ml/louvain.py
index 22f6431..0d2f4fd 100644
--- a/mercury/graph/ml/louvain.py
+++ b/mercury/graph/ml/louvain.py
@@ -4,21 +4,6 @@
 This module constitutes a PySpark implementation of the Louvain algorithm for
 community detection. The algorithm aims to find the partition of a graph that
 yields the maximum modularity.
-
-This version of the algorithm differs from [1]_ in that the reassignment of
-nodes to new communities is calculated in parallel, not sequentially. That is,
-all nodes are reassigned at the same time and conflicts (i.e., 1 -> C2 and
-2 -> C1) are resolved with a simple tie-breaking rule.
-
-References
-----------
-[1] Blondel V D, Guillaume J-L, Lambiotte R and Lefebvre E (2008). Fast
-    unfolding of communities in large networks. Journal of Statistical
-    Mechanics: Theory and Experiment, 2008.
-
-
-[2] Aynaud T, Blondel V D, Guillaume J-L and Lambiotte R (2013). Multilevel
-    local optimization of modularity. Graph Partitioning (315--345), 2013.
 """
 
 from mercury.graph.core.base import BaseClass
@@ -31,13 +16,30 @@
 
 class LouvainCommunities(BaseClass):
     """
-    Class that defines the functions that run the Louvain algorithm to find the
-    partition that maximizes the modularity of an undirected graph (as in [1]_).
+    Class that defines the functions that run a PySpark implementation of the
+    Louvain algorithm to find the partition that maximizes the modularity of an
+    undirected graph (as in [^1]).
+
+    This version of the algorithm differs from [^1] in that the reassignment of
+    nodes to new communities is calculated in parallel, not sequentially. That is,
+    all nodes are reassigned at the same time and conflicts (i.e., 1 -> C2 and
+    2 -> C1) are resolved with a simple tie-breaking rule. This version also
+    introduces the resolution parameter _gamma_, as in [^2].
+
+    [^1]:
+        Blondel V D, Guillaume J-L, Lambiotte R and Lefebvre E (2008). Fast
+        unfolding of communities in large networks. Journal of Statistical
+        Mechanics: Theory and Experiment, 2008.
+
+    [^2]:
+        Aynaud T, Blondel V D, Guillaume J-L and Lambiotte R (2013). Multilevel
+        local optimization of modularity. Graph Partitioning (315--345), 2013.
 
     Args:
         min_modularity_gain (float):
-            Modularity gain threshold between each pass (as in [1]_). The
-            algorithm stops if the gain in modularity between the current pass
+            Modularity gain threshold between each pass. The algorithm
+            stops if the gain in modularity between the current pass
             and the previous one is less than the given threshold.
 
         max_pass (int):
@@ -47,7 +49,7 @@ class LouvainCommunities(BaseClass):
             Maximum number of iterations within each pass.
 
         resolution (float):
-            The resolution parameter _gamma_ as introduced in [2]_. Its value
+            The resolution parameter _gamma_. Its value
             must be greater or equal to zero. If resolution is less than 1,
             modularity favors larger communities, while values greater than 1
             favor smaller communities.
@@ -100,10 +102,10 @@ def __str__(self):
     def fit(self, g: Graph):
         """
         Args:
-            graph (Graph): A mercury graph structure.
+            g (Graph): A mercury graph structure.
 
         Returns:
-            self (object): Fitted self (or raises an error).
+            (self): Fitted self (or raises an error).
         """
 
         edges = g.graphframe.edges
@@ -219,7 +221,7 @@ def fit(self, g: Graph):
         return self
 
     def _verify_data(self, df, expected_cols_grouping, expected_cols_others):
-        """Checks if `edges` meets the format expected by `louvainCommunities`.
+        """Checks if `edges` meets the format expected by `LouvainCommunities`.
 
         Args:
             edges (pyspark.sql.dataframe.DataFrame):
@@ -255,7 +257,7 @@ def _last_pass(self, df):
         Args:
             df (pyspark.sql.dataframe.DataFrame):
                 A pyspark dataframe representing the series of partitions made by
-                `louvainCommunities` (a dataframe with columns 'id', 'pass0',
+                `LouvainCommunities` (a dataframe with columns 'id', 'pass0',
                 'pass1', 'pass2', 'pass3', etc.).
         """
 
@@ -271,8 +273,7 @@ def _last_pass(self, df):
     def _label_degrees(self, edges, partition):
         """
         This function uses the edges of a graph to calculate the weighted degrees
-        of each node (as in [1]_) and joins the result with the partition passed by
-        the user.
+        of each node and joins the result with the partition passed by the user.
 
         Args:
             edges (pyspark.sql.dataframe.DataFrame):
@@ -292,16 +293,6 @@ def _label_degrees(self, edges, partition):
             This function returns a dataframe with columns `id` (representing the ID
             of each node in the graph), `c` (representing each node's community) and
             `degree` (representing each node's degree).
-
-        Example:
-            ```python
-            >>> labelDegrees(edges, partition).show()
-            | id|  c|degree|
-            |---|---|------|
-            |  0|  2|     1|
-            |  1|  2|     2|
-            |  2|  2|     2|
-            ```
         """
 
         # Get id, community and weighted degree
@@ -342,20 +333,10 @@ def _label_edges(self, edges, partition):
             node's ID) and `c` (indicating each node's assigned community).
 
         Returns:
-            edges (pyspark.sql.dataframe.DataFrame):
+            (pyspark.sql.dataframe.DataFrame):
                 This function returns `edges` with two additional columns: the
                 community that the source node belongs to (`cSrc`) and the community
                 that the destination node belongs to (`cDst`).
-
-        Example:
-            ```python
-            >>> labelEdges(graph, partition).show()
-            |dst|src|weight|cSrc|cDst|
-            |---|---|------|----|----|
-            |  0|  1|     1|   2|   2|
-            |  6|  7|     1|   6|   6|
-            |  6|  8|     1|   6|   6|
-            ```
         """
 
         # Get communities
@@ -380,7 +361,7 @@ def _label_edges(self, edges, partition):
         return ret
 
     def _calculate_m(self, edges) -> int:
-        """Get the weighted size of an undirected graph (as in [1]_, where $m$ is
+        """Get the weighted size of an undirected graph (where $m$ is
         defined as $m = \\frac{1}{2} \\sum_{ij} A_{ij}$).
 
         Args:
@@ -415,13 +396,13 @@ def _calculate_modularity(self, edges, partition, m=None) -> float:
             node's ID) and `c` (indicating each node's assigned community).
 
             resolution (float):
-                The resolution parameter _gamma_ as introduced in [2]_. Its value
+                The resolution parameter _gamma_. Its value
                 must be greater or equal to zero. If resolution is less than 1,
                 modularity favors larger communities, while values greater than 1
                 favor smaller communities.
 
             (int):
-                The weighted size of the graph (the output of `getM()`).
+                The weighted size of the graph (the output of `_calculate_m()`).
 
         Returns:
             (float):
@@ -585,16 +566,16 @@ def _reassign_all(self, edges, partition, m=None):
         return dq
 
     def _sort_passes(self, res) -> list:
-        """Takes the output of `louvainCommunities` and returns a list containing
+        """Takes the output of `LouvainCommunities` and returns a list containing
         its columns ordered by their integer part in ascending order.
 
-        For example, if the columns returned by `louvainCommunities` are
+        For example, if the columns returned by `LouvainCommunities` are
         `['pass2', 'id', 'pass1', 'pass0']`, this function will turn the list to
         `['id', 'pass0', 'pass1', 'pass2']`.
 
-        This function also supports cases where `maxPass > 10`.
+        This function also supports cases where `max_pass > 10`.
 
         Args:
             res (pyspark.sql.dataframe.DataFrame):
-                A pyspark dataframe representing the output of `louvainCommunities`.
+                A pyspark dataframe representing the output of `LouvainCommunities`.
                 `res` must have columns 'id', 'pass0', 'pass1', 'pass2', etc.
         """
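A minimal usage sketch assembling the `LouvainCommunities` parameters documented above (values are illustrative; `g_spark` is assumed to be an undirected, graphframes-backed Graph):

```python
from mercury.graph.ml import LouvainCommunities

louvain = LouvainCommunities(
    min_modularity_gain=1e-3,  # stop when a pass gains less modularity than this
    max_pass=5,
    max_iter=10,
    resolution=1.0,            # gamma: <1 favors larger communities, >1 smaller
)
louvain.fit(g_spark)
```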
diff --git a/mercury/graph/ml/spark_randomwalker.py b/mercury/graph/ml/spark_randomwalker.py
index 1b3c83c..516d583 100644
--- a/mercury/graph/ml/spark_randomwalker.py
+++ b/mercury/graph/ml/spark_randomwalker.py
@@ -56,7 +56,7 @@ def fit(self, G: Graph, source_id):
             source_id (int/str/list): the source vertex or list for vertices to start the random walks.
 
         Returns:
-            self (object): Fitted self (or raises an error)
+            (self): Fitted self (or raises an error)
 
             Attribute `paths_` contains a Spark Dataframe with a columns `random_walks` containing an array of the elements
             of the path walked and another column with the corresponding weights. The weights represent the probability of
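A brief sketch of the `fit()` contract shown above for `SparkRandomWalker` (its constructor arguments are not shown in this patch, so defaults are assumed; the seed ids are illustrative):

```python
from mercury.graph.ml import SparkRandomWalker

walker = SparkRandomWalker()
walker.fit(g_spark, source_id=["n1", "n2"])  # seed vertices for the walks
walker.paths_.show()   # Spark DataFrame: random_walks column plus weights
```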
Re-declares graph to add the following attributes: inDegree, outDegree, w_inDegree, w_outDegree.
+
         Args:
             - graph: graphframe object, network
         """
@@ -196,6 +200,7 @@ def _compute_degrees(self, g: Graph):
     def _spread_activation_step(self, g: Graph):
         """
         One step in the spread activation model.
+
         Args:
             graph: graphframe object, network
             attribute: str, name of column for attribute/influence
             spreading_factor: 0 to 1, how much influence to spread
             transfer_function: weighted or unweighted, how to transfer influence along edges
 
         Returns:
-            graphframe object, new network with updated new calculation of attribute in vertices
+            (Graphframe): new network with an updated calculation of the attribute in vertices
         """
 
         # Pass influence/message to neighboring nodes (weighted/unweighted option)
diff --git a/mercury/graph/ml/spectral.py b/mercury/graph/ml/spectral.py
index 974cc3d..3b15731 100644
--- a/mercury/graph/ml/spectral.py
+++ b/mercury/graph/ml/spectral.py
@@ -4,7 +4,7 @@
 from networkx import normalized_laplacian_matrix
 from networkx.algorithms.community import modularity as nx_modularity
 from sklearn.cluster import KMeans
-from numpy.linalg import eig
+from numpy.linalg import eigh
 from numpy import asarray
 
 import numpy as np
@@ -15,7 +15,7 @@ class SpectralClustering(BaseClass):
 
     Args:
         n_clusters (int): The number of clusters that you want to detect.
-        random_state (int): seed for reproducibility
+        random_state (int): Seed for reproducibility
         mode (str): Calculation mode. Pass 'networkx' for using pandas + networkx or 'spark' for spark + graphframes
         max_iterations (int): Max iterations parameter (only used if mode==spark)
@@ -55,7 +55,7 @@ def fit(self, graph: Graph):
             graph (Graph): A mercury graph structure.
 
         Returns:
-            self (object): Fitted self (or raises an error)
+            (self): Fitted self (or raises an error)
         """
 
         if self.mode == "networkx":
@@ -73,13 +73,16 @@ def _fit_networkx(self, graph: Graph):
             graph (Graph): A mercury graph structure.
 
         Returns:
-            self (object): Fitted self (or raises an error)
+            (self): Fitted self (or raises an error)
         """
 
         gnx = graph.networkx.to_undirected()
-        L = normalized_laplacian_matrix(gnx)
+        L = normalized_laplacian_matrix(gnx).todense()
 
-        w, v = eig(L.todense())
+        if not np.allclose(L, L.T):
+            raise ValueError("Normalized Laplacian matrix of the undirected graph should be symmetric")
+
+        w, v = eigh(L)
 
         U = v[:, : self.n_clusters]
         U = asarray(U)
@@ -101,7 +104,7 @@ def _fit_spark(self, graph: Graph):
             graph (Graph): A mercury graph structure.
 
         Returns:
-            self (object): Fitted self (or raises an error)
+            (self): Fitted self (or raises an error)
         """
 
         from pyspark.sql import functions as F
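The `eig` → `eigh` swap in `spectral.py` above is worth spelling out: the normalized Laplacian of an undirected graph is symmetric, and `numpy.linalg.eigh` exploits that, returning real eigenvalues in ascending order, so the first `n_clusters` eigenvectors are exactly the ones spectral clustering needs. A standalone illustration with a toy matrix:

```python
import numpy as np

# A small symmetric, Laplacian-like matrix (stand-in for the real one).
L = np.array([[ 1.0, -0.5, -0.5],
              [-0.5,  1.0, -0.5],
              [-0.5, -0.5,  1.0]])

w, v = np.linalg.eigh(L)   # w is real and ascending; v is orthonormal
U = v[:, :2]               # smallest-eigenvalue eigenvectors, as in _fit_networkx
```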
Transition matrices are used to compute the distribution of probability of being in each of the nodes (or states) of a directed graph (or Markov process). The distribution for state s is: - * $s_t = T*s_{t-1}$ Where: - T is the transition matrix. After calling.fit(), the adjacency matrix is the transition matrix. You can use .topandas() to see it. + T is the transition matrix. After calling.fit(), the adjacency matrix is the transition matrix. You can use .to_pandas() to see it. $s_{t-1}$ is the previous state. - Note: - If created using NetworkX directly, the name of the weight must be 'weight' and must be positive. The recommended way - to create the graph is using .set_row() which will always name the weight as 'weight' but does not check the value. + What .fit() does is scaling the non-zero rows to make them sum 1 as they are probability distributions and make the zero rows + recurrent states. A recurrent state is a final state, a state whose next state is itself. - Args + Args: G (Graph): A `mercury.graph` Graph. Returns: - self (object): Fitted self (or raises an error). + (self): Fitted self (or raises an error). - What .fit() does is scaling the non-zero rows to make them sum 1 as they are probability distributions and make the zero rows - recurrent states. A recurrent state is a final state, a state whose next state is itself. + Note: + If created using NetworkX directly, the name of the weight must be 'weight' and must be positive. The recommended way + to create the graph is using .set_row() which will always name the weight as 'weight' but does not check the value. """ names = list(G.networkx.nodes) @@ -74,21 +68,19 @@ def fit(self, G: Graph): def to_pandas(self, num_iterations=1): """ - Returns the adjacency (which is the transition matrix after .fit() was called) for a given number of iterations as a pandas + Returns the adjacency (which is the transition matrix after `fit()` was called) for a given number of iterations as a pandas dataframe with labeled rows and columns. Args: num_iterations (int): If you want to compute the matrix for a different number of iterations, k, you can use this argument to - raise the matrix to any non negative integer, since: - - * $s_{t+k} = T^k*s_t$ + raise the matrix to any non negative integer, since $s_{t+k} = T^k*s_t$ Returns: (pd.DataFrame): The transition matrix for num_iterations. Note: - This method does not automatically call .fit(). This allows inspecting the adjacency matrix as a pandas dataframe. - The result of computing num_iterations will not make sense if .fit() has not been called before .to_pandas(). + This method does not automatically call `fit()`. This allows inspecting the adjacency matrix as a pandas dataframe. + The result of computing num_iterations will not make sense if `fit()` has not been called before `to_pandas()`. """ if self.fitted_graph_ is None: diff --git a/mercury/graph/viz/moebius.py b/mercury/graph/viz/moebius.py index 24a1988..25054dc 100644 --- a/mercury/graph/viz/moebius.py +++ b/mercury/graph/viz/moebius.py @@ -114,7 +114,7 @@ def node_or_edge_config(self, text_is = None, color_is = None, colors = None, si to the displayed radius. 
         Returns:
-            The node configuration dictionary
+            (dict): The node configuration dictionary
         """
 
         config = {}
diff --git a/mkdocs.yml b/mkdocs.yml
index c9a64c4..4ebb086 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,34 +1,71 @@
 site_name: mercury-graph
 repo_url: https://github.com/BBVA/mercury-graph/
 repo_name: mercury-graph
+
 theme:
   name: material
   features:
   - tabs
   - navigation.indexes
+  - navigation.path
+  - navigation.expand
+  - search.suggest
+  - search.highlight
+  - search.share
+  - content.footnote.tooltips
   icon:
     logo: material/book-open-page-variant
     repo: fontawesome/brands/github
-site_dir: site
+  palette:
+  - media: "(prefers-color-scheme: light)"
+    scheme: default
+    primary: custom
+    accent: custom
+    toggle:
+      icon: material/brightness-7
+      name: Switch to dark mode
+  - media: "(prefers-color-scheme: dark)"
+    scheme: slate
+    primary: custom
+    accent: custom
+    toggle:
+      icon: material/brightness-4
+      name: Switch to light mode
+
 nav:
 - Home: index.md
-- Api:
-  - base tests: reference/basetests.md
-  - data tests: reference/data_tests.md
-  - model tests: reference/model_tests.md
+- API:
+  - mercury.graph.core: reference/core.md
+  - mercury.graph.ml: reference/ml.md
+  - mercury.graph.embeddings: reference/embeddings.md
+  - mercury.graph.viz: reference/viz.md
+
+docs_dir: docs
+site_dir: site
+
+extra:
+  social:
+  - icon: fontawesome/brands/linkedin
+    link: https://es.linkedin.com/showcase/bbva-ai-factory/
+  - icon: fontawesome/brands/x-twitter
+    link: https://x.com/bbva_aifactory
+
+extra_css:
+- stylesheets/extra.css
+extra_javascript:
+- https://polyfill.io/v3/polyfill.min.js?features=es6
+- https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
+
 markdown_extensions:
 - codehilite
 - admonition
 - pymdownx.superfences
 - pymdownx.arithmatex:
     generic: true
-extra_css:
-- stylesheets/extra.css
-extra_javascript:
-- javascripts/config.js
-- https://polyfill.io/v3/polyfill.min.js?features=es6
-- https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
+- footnotes
+
 plugins:
+- search
 - mkdocstrings:
     handlers:
       python:
@@ -37,6 +74,8 @@ plugins:
         show_submodules: true
         merge_init_into_class: true
         docstring_style: google
-    watch:
+
+watch:
 - mercury/graph
+
 dev_addr: 0.0.0.0:8080
\ No newline at end of file
diff --git a/pytest.ini b/pytest.ini
index 71c30aa..abae0ff 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,12 +1,11 @@
 [pytest]
 
 filterwarnings =
+    ignore::FutureWarning
     ignore::UserWarning:pyspark
     ignore::DeprecationWarning:pyspark.sql.pandas.utils
     ignore::DeprecationWarning:pyspark.sql.pandas.conversion
+    ignore::DeprecationWarning:pyspark.ml.image
     ignore::DeprecationWarning:networkx.algorithms.link_analysis.pagerank_alg
     ignore::DeprecationWarning:networkx.linalg.graphmatrix
-    ignore::DeprecationWarning:pyspark.ml.image
-    ignore::FutureWarning:networkx.normalized_laplacian_matrix
-    ignore::FutureWarning:networkx.adjacency_matrix
     ignore::scipy.sparse.SparseEfficiencyWarning
diff --git a/unit_tests/test_embeddings_graphembeddings.py b/unit_tests/test_embeddings_graphembeddings.py
index 0f82765..b77ec7d 100644
--- a/unit_tests/test_embeddings_graphembeddings.py
+++ b/unit_tests/test_embeddings_graphembeddings.py
@@ -69,7 +69,7 @@ def test___init__(self, sample_g):
         ge.fit(sample_g)
 
         assert ge.node_ids == list(sample_g.networkx.nodes)
-        assert isinstance(ge.graph_embedding_, Embeddings)
+        assert isinstance(ge.embeddings_, Embeddings)
 
     def test__load(self, sample_g, manage_path_tmp_binf):
         """
@@ -91,8 +91,8 @@ def test__load(self, sample_g, manage_path_tmp_binf):
         assert (
             sum(
                 sum(
-                    ge1.graph_embedding_.embeddings_matrix_
-                    != ge2.graph_embedding_.embeddings_matrix_
+                    ge1.embeddings_.embeddings_matrix_
+                    != ge2.embeddings_.embeddings_matrix_
                 )
             )
            > 0
@@ -119,8 +119,8 @@ def test_get_most_similar_nodes(self):
         ge = GraphEmbedding(dimension=2, n_jumps=5)
         # Set embedding matrix to known most similar embeddings
         ge.node_ids = ["A", "B", "C", "D"]
-        ge.graph_embedding_ = Embeddings(dimension=2)
-        ge.graph_embedding_.embeddings_matrix_ = np.array(
+        ge.embeddings_ = Embeddings(dimension=2)
+        ge.embeddings_.embeddings_matrix_ = np.array(
             [[1, 0], [0, 1], [1, 0.5], [-1, 0]]
         )
 
diff --git a/unit_tests/test_ml_spectral.py b/unit_tests/test_ml_spectral.py
index ea3bdd5..1a1c409 100644
--- a/unit_tests/test_ml_spectral.py
+++ b/unit_tests/test_ml_spectral.py
@@ -141,4 +141,4 @@ def test_fit_spark(self):
         )
 
         assert sorted(labels_["cluster"].unique()) == [0, 1, 2]
-        assert spectral_clustering.modularity_ > 0
+        assert -0.5 <= spectral_clustering.modularity_ <= 1
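For reference, the relaxed assertion in `test_ml_spectral.py` matches the usual bounds on modularity: a positive value is never guaranteed, even for a valid clustering, since for any partition

```latex
Q \;=\; \frac{1}{2m}\sum_{ij}\Bigl(A_{ij}-\frac{k_i k_j}{2m}\Bigr)\,\delta(c_i,c_j),
\qquad -\tfrac{1}{2}\;\le\; Q \;\le\; 1
```

where $m$ is the weighted graph size defined in `_calculate_m()` above.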