
Commit

Update for October
jolespin committed Nov 5, 2019
1 parent deb61d1 commit 0182e83
Showing 12 changed files with 137 additions and 27 deletions.
Binary file modified .DS_Store
Empty file modified bin/run_soothsayer.py
100644 → 100755
Binary file modified install/.DS_Store
Binary file modified soothsayer/.DS_Store
2 changes: 1 addition & 1 deletion soothsayer/__init__.py
@@ -36,7 +36,7 @@
import datetime
__version__= "2019.10"
#datetime.datetime.utcnow().strftime("%Y.%m")
__version_specific__ = "2019.10.17" #datetime.datetime.utcnow().strftime("%Y.%m.%d")
__version_specific__ = "2019.10.31" #datetime.datetime.utcnow().strftime("%Y.%m.%d")
__author__ = "Josh L. Espinoza"
__email__ = "[email protected], [email protected]"
__url__ = "https://github.com/jolespin/soothsayer"
79 changes: 55 additions & 24 deletions soothsayer/core/core.py
@@ -1,13 +1,18 @@
import os, sys, datetime, copy
from collections import OrderedDict
import pandas as pd
from ..utils import is_path_like, is_nonstring_iterable, pd_dataframe_extend_index
from skbio.util._decorator import experimental, stable

from ..utils import is_path_like, is_nonstring_iterable, pd_dataframe_extend_index, assert_acceptable_arguments
from ..io import read_dataframe, write_object

__all__ = ["Dataset"]
__all__ = sorted(__all__)



# Dataset
@experimental(as_of="2019.06")
class Dataset(object):
def __init__(self, data:pd.DataFrame, metadata_observations:pd.DataFrame=None, metadata_attributes:pd.DataFrame=None, metadata_target_field=None, name=None, description=None, obsv_type=None, attr_type=None, metric_type=None, name_initial_data=None, check_index_overlap=True, alias_metadata_observations:str="m0", alias_metadata_attributes:str="m1", **additional_fields):
"""
@@ -75,16 +80,14 @@ def __init__(self, data:pd.DataFrame, metadata_observations:pd.DataFrame=None, m
if metadata_observations is None:
metadata_observations = pd_dataframe_extend_index(data.index, pd.DataFrame(), axis=0)
self.add_metadata(metadata_observations, axis="observations", metadata_target_field=metadata_target_field)
if self.alias_metadata_observations is not None:
setattr(self, str(self.alias_metadata_observations), self.metadata_observations)



# Metadata attributes
if metadata_attributes is None:
metadata_attributes = pd_dataframe_extend_index(data.columns, pd.DataFrame(), axis=0)
self.add_metadata(metadata_attributes, axis="attributes", metadata_target_field=None)
if self.alias_metadata_attributes is not None:
setattr(self, str(self.alias_metadata_attributes), self.metadata_attributes)


def __repr__(self):
class_name = str(self.__class__).split(".")[-1][:-2]
@@ -153,6 +156,9 @@ def add_metadata(self, metadata:pd.DataFrame, axis="infer", metadata_target_fiel
self.y_field = metadata_target_field
self.y = self.metadata_observations[self.y_field]

if self.alias_metadata_observations is not None:
setattr(self, str(self.alias_metadata_observations), self.metadata_observations)

# Metadata attributes
if axis in {"attrs", "attributes","columns", 1}:
if self.check_index_overlap:
@@ -164,6 +170,8 @@ def add_metadata(self, metadata:pd.DataFrame, axis="infer", metadata_target_fiel
self.metadata_attributes = self.metadata_attributes.to_frame()
if self.check_index_overlap:
self.metadata_attributes = self.metadata_attributes.loc[initial_data_attributes]
if self.alias_metadata_attributes is not None:
setattr(self, str(self.alias_metadata_attributes), self.metadata_attributes)
return self

# Add data versions
@@ -317,25 +325,27 @@ def set_default(self, name_version, observation_subset=None, attribute_subset=No
self.columns_version = attribute_subset
return self

# Filter dataset
def filter(self, func_observations=None, func_attributes=None, name_version=None):
"""
Filter a dataset
"""
# If no version is specified then use the default
if name_version is None:
name_version = self.X_version
assert name_version in self.__database__, f"Cannot find `{name_version}`. Please add it to the datasets via `add_version`"
df = self.__database__[name_version]["data"]
# Observations
idx_observations = df.index
if func_observations is not None:
idx_observations = [*filter(func_observations, idx_observations)]
# Attributes
idx_attributes = df.columns
if func_attributes is not None:
idx_attributes = [*filter(func_attributes, idx_attributes)]
return df.loc[idx_observations, idx_attributes]
# # Filter dataset
# def filter(self, func_observations=None, func_attributes=None, name_version=None):
# """
# Filter a dataset

# #! Revisit this
# """
# # If no version is specified then use the default
# if name_version is None:
# name_version = self.X_version
# assert name_version in self.__database__, f"Cannot find `{name_version}`. Please add it to the datasets via `add_version`"
# df = self.__database__[name_version]["data"]
# # Observations
# idx_observations = df.index
# if func_observations is not None:
# idx_observations = [*filter(func_observations, idx_observations)]
# # Attributes
# idx_attributes = df.columns
# if func_attributes is not None:
# idx_attributes = [*filter(func_attributes, idx_attributes)]
# return df.loc[idx_observations, idx_attributes]

# Write object to file
def to_file(self, path:str, compression="infer"):
@@ -385,5 +395,26 @@ def __iter__(self):
for name_version, d in self.__database__.items():
yield name_version, d["data"]

def __call__(self, field, index=None, func_filter=None, func_map=None, axis=0):
assert_acceptable_arguments(axis, {0,1})
assert not is_nonstring_iterable(field), "`field` cannot be a non-string iterable"
if axis == 0:
assert self.metadata_observations is not None
assert field in self.metadata_observations.columns, "`{}` not in `metadata_observations`".format(field)
data = self.metadata_observations[field]
if axis == 1:
assert self.metadata_attributes is not None
assert field in self.metadata_attributes.columns, "`{}` not in `metadata_attributes`".format(field)
data = self.metadata_attributes[field]
if index is not None:
data = data[index]
if func_filter is not None:
data = data[func_filter]
if func_map is not None:
data = data.map(func_map)

return data


def copy(self):
return copy.deepcopy(self)
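
For context, a minimal usage sketch of the new Dataset.__call__ metadata accessor introduced in this commit: axis=0 reads a field from metadata_observations, axis=1 from metadata_attributes, with optional callable filtering and mapping. The toy data, the "group" field, and the import path below are illustrative assumptions, not part of the commit.

import pandas as pd
from soothsayer.core.core import Dataset

# Toy (observations x attributes) table plus observation metadata (hypothetical example data)
X = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                 index=["s1", "s2", "s3"], columns=["attr_a", "attr_b"])
m0 = pd.DataFrame({"group": ["control", "case", "case"]}, index=["s1", "s2", "s3"])

ds = Dataset(X, metadata_observations=m0, name="toy")
print(ds.m0.head())  # m0 alias for metadata_observations, now assigned inside add_metadata

groups = ds("group", axis=0)                                    # full metadata Series
cases = ds("group", axis=0, func_filter=lambda y: y == "case")  # callable boolean filter
upper = ds("group", axis=0, func_map=str.upper)                 # element-wise mapping
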
Binary file modified soothsayer/io/.DS_Store
65 changes: 64 additions & 1 deletion soothsayer/networks/networks.py
@@ -28,7 +28,7 @@



__all__ = ["Hive", "intramodular_connectivity", "topological_overlap_measure", "signed", "determine_soft_threshold", "TemporalNetwork", "Edge"]
__all__ = ["Hive", "intramodular_connectivity", "topological_overlap_measure", "signed", "determine_soft_threshold","cluster_modularity", "TemporalNetwork", "Edge"]
__all__ = sorted(__all__)

# Network Edge
@@ -1030,6 +1030,69 @@ def determine_soft_threshold(similarity:pd.DataFrame, title=None, show_plot=True
fig.suptitle(title, fontsize=18, fontweight="bold", y=pad)
return fig, ax, df_sft

# Cluster modularity matrix
def cluster_modularity(df:pd.DataFrame, node_type="node", iteration_type="iteration"):
"""
n_louvain = 100
louvain = dict()
for rs in tqdm(range(n_louvain), "Louvain"):
louvain[rs] = community.best_partition(graph_unsigned, random_state=rs)
df = pd.DataFrame(louvain)
# df.head()
# 0 1 2 3 4 5 6 7 8 9
# a 0 0 0 0 0 0 0 0 0 0
# b 1 1 1 1 1 1 1 1 1 1
# c 2 2 2 2 2 2 2 2 2 2
# d 3 3 3 3 3 3 3 3 3 3
# e 4 1 1 4 1 4 4 1 4 1
cluster_modularity(df).head()
iteration 0 1 2 3 4 5 6 7 8 9
node
(b, a) 0 0 0 0 0 0 0 0 0 0
(c, a) 0 0 0 0 0 0 0 0 0 0
(d, a) 0 0 0 0 0 0 0 0 0 0
(e, a) 0 0 0 0 0 0 0 0 0 0
(a, f) 0 0 0 0 0 0 0 0 0 0
"""

# Adapted from @code-different:
# https://stackoverflow.com/questions/58566957/how-to-transform-a-dataframe-of-cluster-class-group-labels-into-a-pairwise-dataf


# `x` is a table of (n=nodes, p=iterations)
nodes = df.index
iterations = df.columns
x = df.values
n,p = x.shape

# `y` is an array of n tables, each having 1 row and p columns
y = x[:, None]

# Using numpy broadcasting, `z` contains the result of comparing each
# table in `y` against `x`. So the shape of `z` is (n x n x p)
z = x == y

# Reshaping `z` by merging the first two dimensions
data = z.reshape((z.shape[0] * z.shape[1], z.shape[2]))

# Redundant pairs
redundant_pairs = list(map(lambda node:frozenset([node]), nodes))

# Create pairwise clustering matrix
df_pairs = pd.DataFrame(
data=data,
index=pd.Index(list(map(frozenset, itertools.product(nodes,nodes))), name=node_type),
columns=pd.Index(iterations, name=iteration_type),
dtype=int,
).drop(redundant_pairs, axis=0)


return df_pairs[~df_pairs.index.duplicated(keep="first")]

# Temporal Networks
class TemporalNetwork(object):
"""
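
As a quick illustration of the new cluster_modularity helper (a sketch, not part of the commit): given a (nodes x iterations) table of cluster labels, it returns a pairwise matrix indexed by frozenset node pairs whose entries are 1 when the two nodes shared a cluster label in that iteration. The toy labels and the import path below are assumptions.

import pandas as pd
from soothsayer.networks import cluster_modularity

# Toy cluster assignments for 4 nodes across 3 clustering iterations (hypothetical data)
df_labels = pd.DataFrame(
    {0: [0, 0, 1, 1],
     1: [0, 0, 1, 2],
     2: [0, 1, 1, 1]},
    index=["a", "b", "c", "d"],
)

df_pairs = cluster_modularity(df_labels)
print(df_pairs.sum(axis=1))  # how often each node pair co-clustered across iterations
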
Binary file modified soothsayer/r_wrappers/.DS_Store
Binary file modified soothsayer/utils/.DS_Store
18 changes: 17 additions & 1 deletion soothsayer/utils/utils.py
@@ -26,7 +26,8 @@
"rgb_to_rgba", "map_colors", "infer_cmap", "infer_vmin_vmax", "infer_continuous_type", "scalarmapping_from_data", "Chromatic", "create_logfile", "determine_mode_for_logfiles",
"is_dict", "is_rgb_like", "is_nonstring_iterable","is_dict_like", "is_color", "is_graph", "is_all_same_type", "is_number", "is_query_class","is_symmetrical", "is_in_namespace",
"format_mpl_legend_handles", "LEGEND_KWS", "DIVERGING_KWS", "CMAP_DIVERGING","COLOR_NEGATIVE", "COLOR_POSITIVE", "get_coords_contour", "get_coords_centroid", "get_parameters_ellipse", "add_cbar_from_data", "configure_scatter",
"pd_series_collapse", "is_path_like", "pd_series_filter", "pd_dataframe_matmul", "pd_series_to_groupby_to_dataframe","pd_dataframe_query","contains","consecutive_replace", "force_symmetry","range_like","generate_random_sequence","fragment","pd_dataframe_extend_index","is_file_like","get_iris_data","assert_acceptable_arguments","filter_compositional","is_function","Command","get_directory_size","DisplayablePath","join_as_strings",
"pd_series_collapse", "is_path_like", "pd_series_filter", "pd_dataframe_matmul", "pd_series_to_groupby_to_dataframe","pd_dataframe_query","pd_dropduplicates_index", "contains","consecutive_replace", "force_symmetry","range_like","generate_random_sequence","fragment","pd_dataframe_extend_index","is_file_like","get_iris_data","assert_acceptable_arguments","filter_compositional","is_function","Command","get_directory_size","DisplayablePath","join_as_strings",
"get_repr",
]
__all__ = sorted(__all__)

@@ -323,6 +324,13 @@ def consecutive_replace(x:str, *patterns):
x = x.replace(a,b)
return x

# Get repr for custom classes
def get_repr(class_name, instance_name=None, *args):
header = "{}(name = {})".format(class_name, instance_name)
info = format_header(header)
for field in args:
info += "\n\t* {}".format(field)
return info
# ============
# Dictionaries
# ============
@@ -1098,6 +1106,14 @@ def pd_dataframe_extend_index(index_extended, df=None, fill=np.nan, axis=0):
A[:] = np.nan
return pd.concat([df, pd.DataFrame(A, index=df.index, columns=idx_extend)]).fillna(fill)

# Drop duplicates index
def pd_dropduplicates_index(data, keep="first", axis=0):
if axis in {0, None}:
return data[~data.index.duplicated(keep=keep)]
if axis == 1:
data = data.T
return data[~data.index.duplicated(keep=keep)].T

# =======
# Filters
# =======
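
A small sketch (not from the commit) of the new pd_dropduplicates_index utility: for axis=0 it keeps the first or last occurrence of duplicated index labels, and for axis=1 it does the same for duplicated columns by transposing. The toy frame, the import path, and the get_repr arguments below are assumptions used only for illustration.

import pandas as pd
from soothsayer.utils import pd_dropduplicates_index, get_repr

df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], index=["a", "a", "b"], columns=["x", "y"])
print(pd_dropduplicates_index(df, keep="first"))            # keeps the first "a" row and "b"
print(pd_dropduplicates_index(df.T, keep="last", axis=1))   # same idea applied to duplicated columns

print(get_repr("Dataset", "iris", "shape = (150, 4)"))      # formatted multi-line repr string
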
Binary file modified tutorials/.DS_Store
