Skip to content

Commit

Permalink
Merge pull request #44 from EpistasisLab/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
perib authored Aug 17, 2023
2 parents e19efb6 + 9169d81 commit 05bf490
Show file tree
Hide file tree
Showing 26 changed files with 2,189 additions and 350 deletions.
2 changes: 1 addition & 1 deletion PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

## What are the relevant issues?

[you can link directly to issues by entering # then the number of the issue, for example, #3 links to issue 3]
[you can link directly to issues by entering # then the number of the issue]

## Screenshots (if appropriate)

Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ If you are interested in using the current stable release of TPOT, you can do th
Please see the [repository license](https://github.com/EpistasisLab/tpot2/blob/main/LICENSE) for the licensing and usage information for TPOT2.
Generally, we have licensed TPOT2 to make it as widely usable as possible.

## Documentation

[The documentation webpage can be found here.](https://epistasislab.github.io/tpot2/)

We also recommend looking at the Tutorial folder for Jupyter notebooks with examples and guides.

## Installation

Expand Down
986 changes: 849 additions & 137 deletions Tutorial/1_Estimators_Overview.ipynb

Large diffs are not rendered by default.

1,022 changes: 944 additions & 78 deletions Tutorial/3_Genetic_Feature_Set_Selectors.ipynb

Large diffs are not rendered by default.

196 changes: 163 additions & 33 deletions Tutorial/4_Symbolic_Regression_and_Classification.ipynb

Large diffs are not rendered by default.

142 changes: 114 additions & 28 deletions Tutorial/7_dask_parallelization.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
"source": [
"# Parallelization\n",
"\n",
"TPOT2 uses the Dask package for parallelization either locally (dask.destributed.LocalCluster) or multi-node via a job schedule (dask-jobqueue). \n"
"TPOT2 uses the Dask package for parallelization either locally (dask.distributed.LocalCluster) or multi-node via a job scheduler (dask-jobqueue). \n",
"\n",
"To parallelize TPOT2 all you need to do is set the n_jobs parameter to the number of cores you want to use. Alternatively, users can create a custom Dask client and pass it in to TPOT2.\n",
"\n",
"This works the same way in all of the estimators (TPOTEstimator, TPOTEstimatorSteadyState, TPOTClassifier, and TPOTRegressor)."
]
},
{
Expand All @@ -22,9 +26,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluations: : 232it [02:01, 1.90it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9998431179414371\n"
]
}
],
"source": [
"#my_analysis.py\n",
"\n",
Expand All @@ -38,7 +57,7 @@
" scorer = sklearn.metrics.get_scorer('roc_auc_ovr')\n",
" X, y = sklearn.datasets.load_digits(return_X_y=True)\n",
" X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n",
" est = tpot2.TPOTClassifier(population_size= 8, generations=5,)\n",
" est = tpot2.TPOTEstimatorSteadyState( n_jobs=10,memory_limit=\"4GB\", classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n",
" est.fit(X_train, y_train)\n",
" print(scorer(est, X_test, y_test))"
]
Expand All @@ -59,9 +78,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluations: : 231it [02:00, 1.92it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9998143035770981\n"
]
}
],
"source": [
"import tpot2\n",
"import sklearn\n",
Expand All @@ -72,7 +106,7 @@
"X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n",
"\n",
"\n",
"est = tpot2.TPOTClassifier(population_size= 8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n",
"est = tpot2.TPOTEstimatorSteadyState( n_jobs=10,memory_limit=\"4GB\", classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n",
"est.fit(X_train, y_train)\n",
"print(scorer(est, X_test, y_test))"
]
Expand All @@ -90,7 +124,11 @@
"\n",
"\n",
"[Dask Python Tutorial](https://docs.dask.org/en/stable/deploying-python.html)\n",
"[Dask Dashboard](https://docs.dask.org/en/stable/dashboard.html)"
"[Dask Dashboard](https://docs.dask.org/en/stable/dashboard.html)\n",
"\n",
"\n",
"Note that if a client is passed in manually, TPOT will ignore n_jobs and memory_limit.\n",
"If there is no client passed in, TPOT will ignore any global/existing client and create its own."
]
},
{
Expand All @@ -103,7 +141,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -128,30 +166,54 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"'http://127.0.0.1:8787/status'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.dashboard_link"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Pass into TPOT to Train.\n",
"Note that the if a client is passed in manually, TPOT will ignore n_jobs and memory_limit.\n",
"If there is no client passed in, TPOT will ignore any global/existing client and create its own."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluations: : 142it [02:00, 1.18it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.999735780838626\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-08-16 16:10:04,735 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n",
"2023-08-16 16:10:04,735 - distributed.nanny - WARNING - Worker process still alive after 3.1999995422363288 seconds, killing\n"
]
}
],
"source": [
"est = tpot2.TPOTClassifier(population_size= 8, generations=5, client=client, verbose=1)\n",
"est = tpot2.TPOTEstimatorSteadyState( client=client, classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n",
"# this is equivalent to: \n",
"# est = tpot2.TPOTClassifier(population_size= 8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n",
"est.fit(X_train, y_train)\n",
Expand All @@ -174,9 +236,33 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluations: : 131it [02:02, 1.07it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9999114413297068\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-08-16 16:12:11,659 - distributed.nanny - WARNING - Worker process still alive after 3.1999995422363288 seconds, killing\n",
"2023-08-16 16:12:11,659 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n",
"2023-08-16 16:12:11,660 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n"
]
}
],
"source": [
"from dask.distributed import Client, LocalCluster\n",
"import tpot2\n",
Expand All @@ -197,7 +283,7 @@
" threads_per_worker=1,\n",
" memory_limit='4GB',\n",
") as cluster, Client(cluster) as client:\n",
" est = tpot2.TPOTClassifier(population_size= 8, generations=5, client=client, verbose=1)\n",
" est = tpot2.TPOTEstimatorSteadyState(client=client, n_jobs=10,memory_limit=\"4GB\", classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n",
" est.fit(X_train, y_train)\n",
" print(scorer(est, X_test, y_test))"
]
Expand Down Expand Up @@ -245,7 +331,7 @@
"X, y = sklearn.datasets.load_digits(return_X_y=True)\n",
"X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n",
"\n",
"est = tpot2.TPOTClassifier(population_size= 100, generations=5, client=client, verbose=1)\n",
"est = tpot2.TPOTEstimatorSteadyState( client=client, classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n",
"# this is equivalent to: \n",
"# est = tpot2.TPOTClassifier(population_size= 8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n",
"est.fit(X_train, y_train)\n",
Expand Down
6 changes: 3 additions & 3 deletions Tutorial/simple_fss.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
one, 1,2,3
two, 4,5,6
three, 7,8,9
one,a,b,c
two,d,e,f
three,g,h,i
1 change: 1 addition & 0 deletions tpot2/builtin_modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .one_hot_encoder import OneHotEncoder
from .column_one_hot_encoder import ColumnOneHotEncoder
from .arithmetictransformer import ArithmeticTransformer
from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
from .passthrough import Passthrough
from .imputer import ColumnSimpleImputer
from .selector_wrappers import RFE_ExtraTreesClassifier, SelectFromModel_ExtraTreesClassifier, RFE_ExtraTreesRegressor, SelectFromModel_ExtraTreesRegressor
4 changes: 2 additions & 2 deletions tpot2/builtin_modules/feature_set_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ def __init__(self, sel_subset=None, name=None):
Parameters
----------
sel_subset: list or int
If X is a dataframe, items in sel_subset must correspond to column names
If X is a numpy array, items in sel_subset must correspond to column indexes
If X is a dataframe, items in sel_subset list must correspond to column names
If X is a numpy array, items in sel_subset list must correspond to column indexes
int: index of a single column
Returns
-------
Expand Down
17 changes: 15 additions & 2 deletions tpot2/config/special_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from functools import partial
import pandas as pd
import numpy as np
from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer

# ArithmeticTransformer
def params_arthmetic_operator(trial, name=None):
Expand All @@ -10,7 +11,20 @@ def params_arthmetic_operator(trial, name=None):
}

def make_arithmetic_transformer_config_dictionary():
return {ArithmeticTransformer: params_arthmetic_operator}
return {
AddTransformer: {},
mul_neg_1_Transformer: {},
MulTransformer: {},
SafeReciprocalTransformer: {},
EQTransformer: {},
NETransformer: {},
GETransformer: {},
GTTransformer: {},
LETransformer: {},
LTTransformer: {},
MinTransformer: {},
MaxTransformer: {},
}



Expand Down Expand Up @@ -65,7 +79,6 @@ def make_FSS_config_dictionary(subsets=None, n_features=None, feature_names=None

if isinstance(subsets, str):
df = pd.read_csv(subsets,header=None,index_col=0)
df.set_index(0,inplace=True)
df['features'] = df.apply(lambda x: list([x[c] for c in df.columns]),axis=1)
subset_dict = {}
for row in df.index:
Expand Down
4 changes: 2 additions & 2 deletions tpot2/evolvers/base_evolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ def optimize(self, generations=None):
self.evaluate_population()

if len(self.population.population) == 0:
raise Exception("No individuals could be evaluated in the initial population")
raise Exception("No individuals could be evaluated in the initial population. This may indicate a bug in the configuration, included models, or objective functions. Set verbose>=4 to see the errors that caused individuals to fail.")

self.generation += 1
# Generation 1 is the first generation after the initial population
Expand Down Expand Up @@ -645,7 +645,7 @@ def get_unevaluated_individuals(self, column_names, budget=None, individual_list
def evaluate_population_selection_early_stop(self,survival_counts, thresholds=None, budget=None):


survival_selector = tpot2.parent_selectors.survival_select_NSGA2
survival_selector = tpot2.selectors.survival_select_NSGA2

################

Expand Down
Loading

0 comments on commit 05bf490

Please sign in to comment.