Merge pull request #44 from EpistasisLab/dev

Dev
EpistasisLab · Aug 17, 2023 · 05bf490 · 05bf490
2 parents e19efb6 + 9169d81
commit 05bf490
Show file tree

Hide file tree

Showing 26 changed files with 2,189 additions and 350 deletions.
diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md
@@ -18,7 +18,7 @@
 
 ## What are the relevant issues?
 
-[you can link directly to issues by entering # then the number of the issue, for example, #3 links to issue 3]
+[you can link directly to issues by entering # then the number of the issue]
 
 ## Screenshots (if appropriate)
 

diff --git a/README.md b/README.md
@@ -17,6 +17,11 @@ If you are interested in using the current stable release of TPOT, you can do th
 Please see the [repository license](https://github.com/EpistasisLab/tpot2/blob/main/LICENSE) for the licensing and usage information for TPOT2.
 Generally, we have licensed TPOT2 to make it as widely usable as possible.
 
+## Documentation
+
+[The documentation webpage can be found here.](https://epistasislab.github.io/tpot2/)
+
+We also recommend looking at the Tutorial folder for Jupyter notebooks with examples and guides.
 
 ## Installation
 

diff --git a/Tutorial/1_Estimators_Overview.ipynb b/Tutorial/1_Estimators_Overview.ipynb
diff --git a/Tutorial/3_Genetic_Feature_Set_Selectors.ipynb b/Tutorial/3_Genetic_Feature_Set_Selectors.ipynb
diff --git a/Tutorial/4_Symbolic_Regression_and_Classification.ipynb b/Tutorial/4_Symbolic_Regression_and_Classification.ipynb
diff --git a/Tutorial/7_dask_parallelization.ipynb b/Tutorial/7_dask_parallelization.ipynb
@@ -7,7 +7,11 @@
    "source": [
     "# Parallelization\n",
     "\n",
-    "TPOT2 uses the Dask package for parallelization either locally (dask.destributed.LocalCluster) or multi-node via a job schedule (dask-jobqueue). \n"
+    "TPOT2 uses the Dask package for parallelization either locally (dask.distributed.LocalCluster) or multi-node via a job scheduler (dask-jobqueue). \n",
+    "\n",
+    "To parallelize TPOT2 all you need to do is set the n_jobs parameter to the number of cores you want to use. Alternatively, users can create a custom Dask client and pass it in to TPOT2.\n",
+    "\n",
+    "This works the same way in all of the estimators (TPOTEstimator, TPOTEstimatorSteadyState, TPOTClassifier, and TPOTRegressor)."
    ]
   },
   {
@@ -22,9 +26,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluations: : 232it [02:01,  1.90it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.9998431179414371\n"
+     ]
+    }
+   ],
    "source": [
     "#my_analysis.py\n",
     "\n",
@@ -38,7 +57,7 @@
     "    scorer = sklearn.metrics.get_scorer('roc_auc_ovr')\n",
     "    X, y = sklearn.datasets.load_digits(return_X_y=True)\n",
     "    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n",
-    "    est = tpot2.TPOTClassifier(population_size= 8, generations=5,)\n",
+    "    est = tpot2.TPOTEstimatorSteadyState( n_jobs=10,memory_limit=\"4GB\", classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n",
     "    est.fit(X_train, y_train)\n",
     "    print(scorer(est, X_test, y_test))"
    ]
@@ -59,9 +78,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluations: : 231it [02:00,  1.92it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.9998143035770981\n"
+     ]
+    }
+   ],
    "source": [
     "import tpot2\n",
     "import sklearn\n",
@@ -72,7 +106,7 @@
     "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n",
     "\n",
     "\n",
-    "est = tpot2.TPOTClassifier(population_size= 8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n",
+    "est = tpot2.TPOTEstimatorSteadyState( n_jobs=10,memory_limit=\"4GB\", classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n",
     "est.fit(X_train, y_train)\n",
     "print(scorer(est, X_test, y_test))"
    ]
@@ -90,7 +124,11 @@
     "\n",
     "\n",
     "[Dask Python Tutorial](https://docs.dask.org/en/stable/deploying-python.html)\n",
-    "[Dask Dashboard](https://docs.dask.org/en/stable/dashboard.html)"
+    "[Dask Dashboard](https://docs.dask.org/en/stable/dashboard.html)\n",
+    "\n",
+    "\n",
+    "Note that if a client is passed in manually, TPOT will ignore n_jobs and memory_limit.\n",
+    "If there is no client passed in, TPOT will ignore any global/existing client and create its own."
    ]
   },
   {
@@ -103,7 +141,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -128,30 +166,54 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'http://127.0.0.1:8787/status'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "client.dashboard_link"
    ]
   },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Pass into TPOT to Train.\n",
-    "Note that the if a client is passed in manually, TPOT will ignore n_jobs and memory_limit.\n",
-    "If there is no client passed in, TPOT will ignore any global/existing client and create its own."
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluations: : 142it [02:00,  1.18it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.999735780838626\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-08-16 16:10:04,735 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n",
+      "2023-08-16 16:10:04,735 - distributed.nanny - WARNING - Worker process still alive after 3.1999995422363288 seconds, killing\n"
+     ]
+    }
+   ],
    "source": [
-    "est = tpot2.TPOTClassifier(population_size= 8, generations=5, client=client, verbose=1)\n",
+    "est = tpot2.TPOTEstimatorSteadyState( client=client, classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n",
     "# this is equivalent to: \n",
     "# est = tpot2.TPOTClassifier(population_size= 8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n",
     "est.fit(X_train, y_train)\n",
@@ -174,9 +236,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluations: : 131it [02:02,  1.07it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.9999114413297068\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-08-16 16:12:11,659 - distributed.nanny - WARNING - Worker process still alive after 3.1999995422363288 seconds, killing\n",
+      "2023-08-16 16:12:11,659 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n",
+      "2023-08-16 16:12:11,660 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n"
+     ]
+    }
+   ],
    "source": [
     "from dask.distributed import Client, LocalCluster\n",
     "import tpot2\n",
@@ -197,7 +283,7 @@
     "    threads_per_worker=1,\n",
     "    memory_limit='4GB',\n",
     ") as cluster, Client(cluster) as client:\n",
-    "    est = tpot2.TPOTClassifier(population_size= 8, generations=5, client=client, verbose=1)\n",
+    "    est = tpot2.TPOTEstimatorSteadyState(client=client, n_jobs=10,memory_limit=\"4GB\", classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n",
     "    est.fit(X_train, y_train)\n",
     "    print(scorer(est, X_test, y_test))"
    ]
@@ -245,7 +331,7 @@
     "X, y = sklearn.datasets.load_digits(return_X_y=True)\n",
     "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n",
     "\n",
-    "est = tpot2.TPOTClassifier(population_size= 100, generations=5, client=client, verbose=1)\n",
+    "est = tpot2.TPOTEstimatorSteadyState( client=client, classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n",
     "# this is equivalent to: \n",
     "# est = tpot2.TPOTClassifier(population_size= 8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n",
     "est.fit(X_train, y_train)\n",

diff --git a/Tutorial/simple_fss.csv b/Tutorial/simple_fss.csv
@@ -1,3 +1,3 @@
-one, 1,2,3
-two, 4,5,6
-three, 7,8,9
+one,a,b,c
+two,d,e,f
+three,g,h,i
diff --git a/tpot2/builtin_modules/__init__.py b/tpot2/builtin_modules/__init__.py
@@ -3,6 +3,7 @@
 from .one_hot_encoder import OneHotEncoder
 from .column_one_hot_encoder import ColumnOneHotEncoder
 from .arithmetictransformer import ArithmeticTransformer
+from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
 from .passthrough import Passthrough
 from .imputer import ColumnSimpleImputer
 from .selector_wrappers import RFE_ExtraTreesClassifier, SelectFromModel_ExtraTreesClassifier, RFE_ExtraTreesRegressor, SelectFromModel_ExtraTreesRegressor
diff --git a/tpot2/builtin_modules/feature_set_selector.py b/tpot2/builtin_modules/feature_set_selector.py
@@ -41,8 +41,8 @@ def __init__(self, sel_subset=None, name=None):
         Parameters
         ----------
         sel_subset: list or int
-            If X is a dataframe, items in sel_subset must correspond to column names
-            If X is a numpy array, items in sel_subset must correspond to column indexes
+            If X is a dataframe, items in sel_subset list must correspond to column names
+            If X is a numpy array, items in sel_subset list must correspond to column indexes
             int: index of a single column
         Returns
         -------

diff --git a/tpot2/config/special_configs.py b/tpot2/config/special_configs.py
@@ -2,6 +2,7 @@
 from functools import partial
 import pandas as pd
 import numpy as np
+from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
 
 # ArithmeticTransformer
 def params_arthmetic_operator(trial, name=None):
@@ -10,7 +11,20 @@ def params_arthmetic_operator(trial, name=None):
                 }
 
 def make_arithmetic_transformer_config_dictionary():
-        return {ArithmeticTransformer: params_arthmetic_operator}
+        return  {   
+                AddTransformer: {},
+                mul_neg_1_Transformer: {}, 
+                MulTransformer: {},
+                SafeReciprocalTransformer: {},
+                EQTransformer: {},
+                NETransformer: {},
+                GETransformer: {},
+                GTTransformer: {},
+                LETransformer: {},
+                LTTransformer: {},
+                MinTransformer: {}, 
+                MaxTransformer: {},
+        }
 
 
 
@@ -65,7 +79,6 @@ def make_FSS_config_dictionary(subsets=None, n_features=None, feature_names=None
 
     if isinstance(subsets, str):
         df = pd.read_csv(subsets,header=None,index_col=0)
-        df.set_index(0,inplace=True)
         df['features'] = df.apply(lambda x: list([x[c] for c in df.columns]),axis=1) 
         subset_dict = {}
         for row in df.index:

diff --git a/tpot2/evolvers/base_evolver.py b/tpot2/evolvers/base_evolver.py
@@ -390,7 +390,7 @@ def optimize(self, generations=None):
                         self.evaluate_population()
 
                     if len(self.population.population) == 0:
-                        raise Exception("No individuals could be evaluated in the initial population")
+                        raise Exception("No individuals could be evaluated in the initial population. This may indicate a bug in the configuration, included models, or objective functions. Set verbose>=4 to see the errors that caused individuals to fail.")
 
                     self.generation += 1
                 # Generation 1 is the first generation after the initial population
@@ -645,7 +645,7 @@ def get_unevaluated_individuals(self, column_names, budget=None, individual_list
     def evaluate_population_selection_early_stop(self,survival_counts, thresholds=None, budget=None):
 
 
-        survival_selector = tpot2.parent_selectors.survival_select_NSGA2
+        survival_selector = tpot2.selectors.survival_select_NSGA2
 
         ################