read_from_clipboard

jolespin · Nov 28, 2019 · 3fd3318 · 3fd3318
1 parent 803b5f4
commit 3fd3318
Show file tree

Hide file tree

Showing 13 changed files with 81 additions and 4 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/bin/Icon b/bin/Icon
diff --git a/bin/run_soothsayer.py b/bin/run_soothsayer.py
diff --git a/install/.DS_Store b/install/.DS_Store
diff --git a/soothsayer/.DS_Store b/soothsayer/.DS_Store
diff --git a/soothsayer/__init__.py b/soothsayer/__init__.py
@@ -36,7 +36,7 @@
 import datetime
 __version__= "2019.11"
 #datetime.datetime.utcnow().strftime("%Y.%m")
-__version_specific__ = "2019.11.05" #datetime.datetime.utcnow().strftime("%Y.%m.%d")
+__version_specific__ = "2019.11.26" #datetime.datetime.utcnow().strftime("%Y.%m.%d")
 __author__ = "Josh L. Espinoza"
 __email__ = "[email protected], [email protected]"
 __url__ = "https://github.com/jolespin/soothsayer"

diff --git a/soothsayer/feature_extraction/feature_extraction.py b/soothsayer/feature_extraction/feature_extraction.py
@@ -1,8 +1,13 @@
 from .algorithms.clairvoyant import *
-from ..io import read_textfile
+from ..io import read_textfile, read_dataframe
+from ..utils import is_path_like, assert_acceptable_arguments
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+import warnings
+import pandas as pd
 
 _algorithms = ["Clairvoyant"]
-_utils = ["get_trace_from_algorithm_log"]
+_utils = ["get_trace_from_algorithm_log", "get_best_model_from_algorithm"]
 __all__ = _algorithms + _utils
 __all__ = sorted(__all__)
 
@@ -24,3 +29,61 @@ def get_trace_from_algorithm_log(path:str, algorithm:str="Clairvoyance"):
                 accuracy = float(line.split("Accuracy=")[1].split("\t")[0])
                 trace.append(accuracy)
     return {"baseline":baseline, "trace":trace}
+
+
+
+# Get best model from feature selection
+def get_best_model_from_algorithm(synopsis:pd.DataFrame, model_type="infer", prefer="logistic", less_features_is_better=True, copy_synopsis=True, into=pd.Series, algorithm="clairvoyance"):
+    multiple_hits_message = "Multiple instances with best accuracy, lowest sem, and number of features.  Choosing first option."
+    assert algorithm == "clairvoyance", "Currently, `clairvoyance` is the only supported algorithm"
+    if is_path_like(synopsis):
+        df_synopsis = read_dataframe(synopsis, evaluate_columns=["hyperparameters"] )
+        for id_feature_field in filter(lambda x:x.endswith("_set"), df_synopsis.columns):
+            df_synopsis[id_feature_field] = df_synopsis[id_feature_field].map(eval)
+            try:
+                df_synopsis[id_feature_field] = df_synopsis[id_feature_field].map(lambda x:list(map(eval, x)))
+            except NameError:
+                pass
+    else:
+        df_synopsis = synopsis
+        id_feature_field = list(filter(lambda x:x.endswith("_set"), df_synopsis.columns))[0]
+
+    # Sort the synopsis
+    feature_type = "_".join(id_feature_field.split("_")[:-1])
+    df_synopsis = df_synopsis.sort_values(["accuracy", "sem", "num_{}_included".format(feature_type)], ascending=[False, True, less_features_is_better])
+
+    # Infer best model_type
+    if model_type == "infer":
+        max_accuracy = df_synopsis["accuracy"].max()
+        idx_with_max_accuracy = df_synopsis["accuracy"][lambda x: x == max_accuracy].index
+        if len(idx_with_max_accuracy) == 1:
+            model_type = df_synopsis.loc[idx_with_max_accuracy[0],"model_type"]
+        else:
+            model_types_with_best_accuracy = df_synopsis.loc[idx_with_max_accuracy,"model_type"].unique()
+            if len(model_types_with_best_accuracy) == 2:
+                model_type = prefer
+            else:
+                warnings.warn(multiple_hits_message)
+                model_type = model_types_with_best_accuracy[0]
+
+    # Subset model of interest
+    df_synopsis = df_synopsis.query("model_type == '{}'".format(model_type))
+    max_accuracy = df_synopsis["accuracy"].max()
+    idx_with_max_accuracy = df_synopsis["accuracy"][lambda x: x == max_accuracy].index
+    if len(idx_with_max_accuracy) > 1:
+        warnings.warn(multiple_hits_message)
+    # Best model
+    best_model_info = df_synopsis.loc[idx_with_max_accuracy[0]]
+    ModelClass = {
+        "logistic":LogisticRegression,
+        "tree":DecisionTreeClassifier}[model_type]
+    output_info = {
+        "clf":ModelClass(**best_model_info["hyperparameters"],
+                           random_state=best_model_info["random_state"]),
+        "hyperparameters":best_model_info["hyperparameters"],
+        "features":best_model_info[id_feature_field],
+        **best_model_info[["accuracy", "sem", "delta"]],
+    }
+    if copy_synopsis:
+        output_info["synopsis"] = df_synopsis
+    return into(output_info)
diff --git a/soothsayer/io/.DS_Store b/soothsayer/io/.DS_Store
diff --git a/soothsayer/r_wrappers/.DS_Store b/soothsayer/r_wrappers/.DS_Store
diff --git a/soothsayer/utils/.DS_Store b/soothsayer/utils/.DS_Store
diff --git a/soothsayer/utils/utils.py b/soothsayer/utils/utils.py
@@ -27,7 +27,7 @@
 "is_dict", "is_rgb_like", "is_nonstring_iterable","is_dict_like", "is_color", "is_graph", "is_all_same_type", "is_number", "is_query_class","is_symmetrical", "is_in_namespace",
 "format_mpl_legend_handles", "LEGEND_KWS", "DIVERGING_KWS", "CMAP_DIVERGING","COLOR_NEGATIVE", "COLOR_POSITIVE",  "get_coords_contour", "get_coords_centroid", "get_parameters_ellipse", "add_cbar_from_data", "configure_scatter",
 "pd_series_collapse", "is_path_like", "pd_series_filter", "pd_dataframe_matmul", "pd_series_to_groupby_to_dataframe","pd_dataframe_query","pd_dropduplicates_index", "contains","consecutive_replace", "force_symmetry","range_like","generate_random_sequence","fragment","pd_dataframe_extend_index","is_file_like","get_iris_data","assert_acceptable_arguments","filter_compositional","is_function","Command","get_directory_size","DisplayablePath","join_as_strings",
-"get_repr",
+"get_repr","read_from_clipboard",
 ]
 __all__ = sorted(__all__)
 
@@ -39,6 +39,20 @@
 DIVERGING_KWS = dict(h_neg=220, h_pos=15, sep=20, s=90, l=50)
 CMAP_DIVERGING = sns.diverging_palette(**DIVERGING_KWS, as_cmap=True)
 COLOR_NEGATIVE, COLOR_POSITIVE = sns.diverging_palette(**DIVERGING_KWS, n=2).as_hex()
+# =========
+# Clipboard
+# =========
+def read_from_clipboard(sep="\n", into=list):
+    data = pd.io.clipboard.clipboard_get()
+    if sep is not None:
+        return into(filter(bool,
+                           map(lambda x:x.strip(),
+                               data.split(sep)
+                           )
+                   )
+               )
+    else:
+        return data
 
 # ===========
 # Assertions

diff --git a/standalone/Icon b/standalone/Icon
diff --git a/tutorials/.DS_Store b/tutorials/.DS_Store