From e6febdff09ebf3256218d981f2cc6deeea8749e6 Mon Sep 17 00:00:00 2001
From: Anup Kumar
Date: Mon, 14 Oct 2024 15:09:49 +0000
Subject: [PATCH 1/7] add files

---
 tools/tabpfn/main.py    | 27 ++++++++++++++++
 tools/tabpfn/tabpfn.xml | 72 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 tools/tabpfn/main.py
 create mode 100644 tools/tabpfn/tabpfn.xml

diff --git a/tools/tabpfn/main.py b/tools/tabpfn/main.py
new file mode 100644
index 0000000000..3035a61afa
--- /dev/null
+++ b/tools/tabpfn/main.py
@@ -0,0 +1,27 @@
+"""
+Tabular data prediction using TabPFN
+"""
+
+import argparse
+
+import imageio
+import numpy as np
+import torch
+
+
+def model():
+    """
+
+    """
+    print()
+
+
+if __name__ == "__main__":
+    arg_parser = argparse.ArgumentParser()
+    #arg_parser.add_argument("-im", "--imaging_model", required=True, help="Input BioImage model")
+    #arg_parser.add_argument("-ii", "--image_file", required=True, help="Input image file")
+    #arg_parser.add_argument("-is", "--image_size", required=True, help="Input image file's size")
+
+    # get argument values
+    #args = vars(arg_parser.parse_args())
+    #model_path = args["imaging_model"]
diff --git a/tools/tabpfn/tabpfn.xml b/tools/tabpfn/tabpfn.xml
new file mode 100644
index 0000000000..8c665c6c2f
--- /dev/null
+++ b/tools/tabpfn/tabpfn.xml
@@ -0,0 +1,72 @@
[The XML markup of this new wrapper file was stripped during extraction; only element text survives. The recoverable content of the 72 added lines is:]
+    with PyTorch
+        2.4.0
+        0
+        python
+        pytorch
+        torchvision
+        imageio
+    echo "@VERSION@"
+        config > test_information > inputs > size)
+
+        **Output files**
+        - Predicted image: Predicted image using the BioImage.IO model
+        - Predicted image matrix: Predicted image matrix in original dimensions
+    ]]>
+        10.1145/3620665.3640366
+        10.1101/2022.06.07.495102

From 3569e1eed34cee15fb8b8d2c709eee57b6370eff Mon Sep 17 00:00:00 2001
From: Anup Kumar
Date: Fri, 18 Oct 2024 14:13:22 +0000
Subject: [PATCH 2/7] update

---
 tools/tabpfn/tabpfn.xml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tools/tabpfn/tabpfn.xml b/tools/tabpfn/tabpfn.xml
index 8c665c6c2f..3317767529 100644
--- a/tools/tabpfn/tabpfn.xml
+++ b/tools/tabpfn/tabpfn.xml
@@ -1,7 +1,7 @@
[XML tags stripped in extraction; the hunk bumps the version token:]
     with PyTorch
-    2.4.0
+    0.1
     0
@@ -10,10 +10,7 @@
[and swaps the requirement set:]
-    python
-    pytorch
-    torchvision
-    imageio
+    tabpfn
     echo "@VERSION@"

From 6048605b93888fa880cda0634ed667ba0306498e Mon Sep 17 00:00:00 2001
From: Anup Kumar
Date: Mon, 21 Oct 2024 15:46:34 +0000
Subject: [PATCH 3/7] update

---
 tools/tabpfn/main.py    | 46 ++++++++++++++++++++++++++++++++---------
 tools/tabpfn/tabpfn.xml | 43 +++++++++++---------------------------
 2 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/tools/tabpfn/main.py b/tools/tabpfn/main.py
index 3035a61afa..fe7a9a3052 100644
--- a/tools/tabpfn/main.py
+++ b/tools/tabpfn/main.py
@@ -3,25 +3,51 @@
 """
 
 import argparse
+import time
 
-import imageio
-import numpy as np
+from sklearn.metrics import accuracy_score
+from tabpfn import TabPFNClassifier
+import pandas as pd
 import torch
 
 
-def model():
+def separate_features_labels(data):
+    df = pd.read_csv(data, sep=",")
+    labels = df.iloc[:, -1]
+    features = df.iloc[:, :-1]
+    print(df)
+    print(features)
+    print(labels)
+    return features, labels
+
+
+def train_evaluate(args):
     """
-
+    Train TabPFN
     """
-    print()
+    print(args)
+
+    tr_features, tr_labels = separate_features_labels(args["train_data"])
+    te_features, te_labels = separate_features_labels(args["test_data"])
+
+    classifier = TabPFNClassifier(device='cpu', N_ensemble_configurations=32)
+    s_time = time.time()
+    classifier.fit(tr_features, tr_labels)
+    e_time = time.time()
+    print("Time taken by TabPFN for training: {} seconds".format(e_time - s_time))
+    y_eval, p_eval = classifier.predict(te_features, return_winning_probability=True)
+    print('Accuracy', accuracy_score(te_labels, y_eval))
+
+    te_features["true_labels"] = te_labels
+    te_features["pred_labels"] = y_eval
+    te_features.to_csv("output_predicted_data.csv", sep="\t", index=None)
 
 
 if __name__ == "__main__":
     arg_parser = argparse.ArgumentParser()
-    #arg_parser.add_argument("-im", "--imaging_model", required=True, help="Input BioImage model")
-    #arg_parser.add_argument("-ii", "--image_file", required=True, help="Input image file")
-    #arg_parser.add_argument("-is", "--image_size", required=True, help="Input image file's size")
+    arg_parser.add_argument("-trdata", "--train_data", required=True, help="Train data")
+    arg_parser.add_argument("-tedata", "--test_data", required=True, help="Test data")
 
     # get argument values
-    #args = vars(arg_parser.parse_args())
-    #model_path = args["imaging_model"]
+    args = vars(arg_parser.parse_args())
+    train_evaluate(args)
diff --git a/tools/tabpfn/tabpfn.xml b/tools/tabpfn/tabpfn.xml
index 3317767529..6687ee1225 100644
--- a/tools/tabpfn/tabpfn.xml
+++ b/tools/tabpfn/tabpfn.xml
@@ -10,60 +10,41 @@
[XML tags stripped in extraction; the hunk pins the requirements and rewrites the command/inputs/outputs/tests markup. The recoverable element text is:]
-    tabpfn
+    tabpfn
+    pandas
     echo "@VERSION@"
[runs of bare +/- markers from the stripped markup omitted]
-        config > test_information > inputs > size)
+        - Training data
+        - Test data
 
         **Output files**
-        - Predicted image: Predicted image using the BioImage.IO model
-        - Predicted image matrix: Predicted image matrix in original dimensions
+        - Predicted data along with predicted labels
     ]]>
-        10.1145/3620665.3640366
-        10.1101/2022.06.07.495102
+        10.48550/arXiv.2207.01848

From 91d25f5586ce3a4bd7e10be09a0ec168df6bd3e7 Mon Sep 17 00:00:00 2001
From: Anup Kumar
Date: Tue, 22 Oct 2024 12:05:28 +0200
Subject: [PATCH 4/7] update

---
 tools/tabpfn/main.py    | 320 +++++++++++++++++++++++++++++++++++++++-
 tools/tabpfn/tabpfn.xml |   1 +
 2 files changed, 320 insertions(+), 1 deletion(-)

diff --git a/tools/tabpfn/main.py b/tools/tabpfn/main.py
index fe7a9a3052..ab4b081cc1 100644
--- a/tools/tabpfn/main.py
+++ b/tools/tabpfn/main.py
@@ -6,9 +6,326 @@
 import time
 
 from sklearn.metrics import accuracy_score
+import matplotlib.pyplot as plt
+from functools import reduce
+
+import numpy as np
 import pandas as pd
 import torch
+from sklearn.preprocessing import LabelEncoder
+from sklearn.utils._optional_dependencies import check_matplotlib_support
+from sklearn.utils import _safe_indexing
+from sklearn.base import is_regressor
+from sklearn.utils.validation import check_is_fitted
+import numpy as np
+
+import matplotlib.pyplot as plt
+from matplotlib.colors import ListedColormap
+
+from tabpfn import TabPFNClassifier
+
+
+class DecisionBoundaryDisplay:
+    """Decisions boundary visualization.
+    It is recommended to use
+    :func:`~sklearn.inspection.DecisionBoundaryDisplay.from_estimator`
+    to create a :class:`DecisionBoundaryDisplay`. All parameters are stored as
+    attributes.
+    Read more in the :ref:`User Guide `.
+    .. versionadded:: 1.1
+    Parameters
+    ----------
+    xx0 : ndarray of shape (grid_resolution, grid_resolution)
+        First output of :func:`meshgrid `.
+    xx1 : ndarray of shape (grid_resolution, grid_resolution)
+        Second output of :func:`meshgrid `.
+    response : ndarray of shape (grid_resolution, grid_resolution)
+        Values of the response function.
+    xlabel : str, default=None
+        Default label to place on x axis.
+ ylabel : str, default=None + Default label to place on y axis. + Attributes + ---------- + surface_ : matplotlib `QuadContourSet` or `QuadMesh` + If `plot_method` is 'contour' or 'contourf', `surface_` is a + :class:`QuadContourSet `. If + `plot_method is `pcolormesh`, `surface_` is a + :class:`QuadMesh `. + ax_ : matplotlib Axes + Axes with confusion matrix. + figure_ : matplotlib Figure + Figure containing the confusion matrix. + """ + + def __init__(self, *, xx0, xx1, response, xlabel=None, ylabel=None): + self.xx0 = xx0 + self.xx1 = xx1 + self.response = response + self.xlabel = xlabel + self.ylabel = ylabel + + def plot(self, plot_method="contourf", ax=None, xlabel=None, ylabel=None, **kwargs): + """Plot visualization. + Parameters + ---------- + plot_method : {'contourf', 'contour', 'pcolormesh'}, default='contourf' + Plotting method to call when plotting the response. Please refer + to the following matplotlib documentation for details: + :func:`contourf `, + :func:`contour `, + :func:`pcolomesh `. + ax : Matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + xlabel : str, default=None + Overwrite the x-axis label. + ylabel : str, default=None + Overwrite the y-axis label. + **kwargs : dict + Additional keyword arguments to be passed to the `plot_method`. + Returns + ------- + display: :class:`~sklearn.inspection.DecisionBoundaryDisplay` + """ + check_matplotlib_support("DecisionBoundaryDisplay.plot") + import matplotlib.pyplot as plt # noqa + + if plot_method not in ("contourf", "contour", "pcolormesh"): + raise ValueError( + "plot_method must be 'contourf', 'contour', or 'pcolormesh'" + ) + + if ax is None: + _, ax = plt.subplots() + + plot_func = getattr(ax, plot_method) + self.surface_ = plot_func(self.xx0, self.xx1, self.response, **kwargs) + + if xlabel is not None or not ax.get_xlabel(): + xlabel = self.xlabel if xlabel is None else xlabel + ax.set_xlabel(xlabel) + if ylabel is not None or not ax.get_ylabel(): + ylabel = self.ylabel if ylabel is None else ylabel + ax.set_ylabel(ylabel) + + self.ax_ = ax + self.figure_ = ax.figure + plt.savefig("output_decision_boundary.png") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + *, + grid_resolution=100, + eps=1.0, + plot_method="contourf", + response_method="auto", + xlabel=None, + ylabel=None, + ax=None, + **kwargs, + ): + """Plot decision boundary given an estimator. + Read more in the :ref:`User Guide `. + Parameters + ---------- + estimator : object + Trained estimator used to plot the decision boundary. + X : {array-like, sparse matrix, dataframe} of shape (n_samples, 2) + Input data that should be only 2-dimensional. + grid_resolution : int, default=100 + Number of grid points to use for plotting decision boundary. + Higher values will make the plot look nicer but be slower to + render. + eps : float, default=1.0 + Extends the minimum and maximum values of X for evaluating the + response function. + plot_method : {'contourf', 'contour', 'pcolormesh'}, default='contourf' + Plotting method to call when plotting the response. Please refer + to the following matplotlib documentation for details: + :func:`contourf `, + :func:`contour `, + :func:`pcolomesh `. + response_method : {'auto', 'predict_proba', 'decision_function', \ + 'predict'}, default='auto' + Specifies whether to use :term:`predict_proba`, + :term:`decision_function`, :term:`predict` as the target response. 
+ If set to 'auto', the response method is tried in the following order: + :term:`decision_function`, :term:`predict_proba`, :term:`predict`. + For multiclass problems, :term:`predict` is selected when + `response_method="auto"`. + xlabel : str, default=None + The label used for the x-axis. If `None`, an attempt is made to + extract a label from `X` if it is a dataframe, otherwise an empty + string is used. + ylabel : str, default=None + The label used for the y-axis. If `None`, an attempt is made to + extract a label from `X` if it is a dataframe, otherwise an empty + string is used. + ax : Matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + **kwargs : dict + Additional keyword arguments to be passed to the + `plot_method`. + Returns + ------- + display : :class:`~sklearn.inspection.DecisionBoundaryDisplay` + Object that stores the result. + See Also + -------- + DecisionBoundaryDisplay : Decision boundary visualization. + ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix + given an estimator, the data, and the label. + ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix + given the true and predicted labels. + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.inspection import DecisionBoundaryDisplay + >>> iris = load_iris() + >>> X = iris.data[:, :2] + >>> classifier = LogisticRegression().fit(X, iris.target) + >>> disp = DecisionBoundaryDisplay.from_estimator( + ... classifier, X, response_method="predict", + ... xlabel=iris.feature_names[0], ylabel=iris.feature_names[1], + ... alpha=0.5, + ... ) + >>> disp.ax_.scatter(X[:, 0], X[:, 1], c=iris.target, edgecolor="k") + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + check_is_fitted(estimator) + + if not grid_resolution > 1: + raise ValueError( + "grid_resolution must be greater than 1. Got" + f" {grid_resolution} instead." + ) + + if not eps >= 0: + raise ValueError( + f"eps must be greater than or equal to 0. Got {eps} instead." + ) + + possible_plot_methods = ("contourf", "contour", "pcolormesh") + if plot_method not in possible_plot_methods: + available_methods = ", ".join(possible_plot_methods) + raise ValueError( + f"plot_method must be one of {available_methods}. " + f"Got {plot_method} instead." 
+ ) + + x0, x1 = _safe_indexing(X, 0, axis=1), _safe_indexing(X, 1, axis=1) + + x0_min, x0_max = x0.min() - eps, x0.max() + eps + x1_min, x1_max = x1.min() - eps, x1.max() + eps + + xx0, xx1 = np.meshgrid( + np.linspace(x0_min, x0_max, grid_resolution), + np.linspace(x1_min, x1_max, grid_resolution), + ) + if hasattr(X, "iloc"): + # we need to preserve the feature names and therefore get an empty dataframe + X_grid = X.iloc[[], :].copy() + X_grid.iloc[:, 0] = xx0.ravel() + X_grid.iloc[:, 1] = xx1.ravel() + else: + X_grid = np.c_[xx0.ravel(), xx1.ravel()] + + pred_func = _check_boundary_response_method(estimator, response_method) + response = pred_func(X_grid) + + # convert classes predictions into integers + if pred_func.__name__ == "predict" and hasattr(estimator, "classes_"): + encoder = LabelEncoder() + encoder.classes_ = estimator.classes_ + response = encoder.transform(response) + + if response.ndim != 1: + if is_regressor(estimator): + raise ValueError("Multi-output regressors are not supported") + + # TODO: Support pos_label + response = response[:, 1] + + if xlabel is None: + xlabel = X.columns[0] if hasattr(X, "columns") else "" + + if ylabel is None: + ylabel = X.columns[1] if hasattr(X, "columns") else "" + + display = DecisionBoundaryDisplay( + xx0=xx0, + xx1=xx1, + response=response.reshape(xx0.shape), + xlabel=xlabel, + ylabel=ylabel, + ) + return display.plot(ax=ax, plot_method=plot_method, **kwargs) + + +def make_decision_boundaries(trained_model, tr_features, tr_labels): + # PLOTTING + fig = plt.figure(figsize=(10,10)) + ax = fig.add_subplot(111) + cm = plt.cm.RdBu + cm_bright = ListedColormap(["purple", "yellow"]) + trained_model.fit(tr_features[:, 0:2], tr_labels) + + DecisionBoundaryDisplay.from_estimator( + trained_model, tr_features[:, 0:2], alpha=0.6, ax=ax, eps=2.0, grid_resolution=25, response_method="predict_proba" + ) + ax.scatter(tr_features[:, 0], tr_features[:, 1], c=tr_labels > 0, cmap=cm_bright) + + +def _check_boundary_response_method(estimator, response_method): + """Return prediction method from the `response_method` for decision boundary. + Parameters + ---------- + estimator : object + Fitted estimator to check. + response_method : {'auto', 'predict_proba', 'decision_function', 'predict'} + Specifies whether to use :term:`predict_proba`, + :term:`decision_function`, :term:`predict` as the target response. + If set to 'auto', the response method is tried in the following order: + :term:`decision_function`, :term:`predict_proba`, :term:`predict`. + Returns + ------- + prediction_method: callable + Prediction method of estimator. + """ + has_classes = hasattr(estimator, "classes_") + + if has_classes and len(estimator.classes_) > 2: + if response_method not in {"auto", "predict"}: + msg = ( + "Multiclass classifiers are only supported when response_method is" + " 'predict' or 'auto'" + ) + raise ValueError(msg) + methods_list = ["predict"] + elif response_method == "auto": + methods_list = ["decision_function", "predict_proba", "predict"] + else: + methods_list = [response_method] + + prediction_method = [getattr(estimator, method, None) for method in methods_list] + prediction_method = reduce(lambda x, y: x or y, prediction_method) + if prediction_method is None: + raise ValueError( + f"{estimator.__class__.__name__} has none of the following attributes: " + f"{', '.join(methods_list)}." 
+ ) + + return prediction_method def separate_features_labels(data): @@ -41,6 +358,7 @@ def train_evaluate(args): te_features["true_labels"] = te_labels te_features["pred_labels"] = y_eval te_features.to_csv("output_predicted_data.csv", sep="\t", index=None) + make_decision_boundaries(classifier, tr_features, tr_labels) if __name__ == "__main__": diff --git a/tools/tabpfn/tabpfn.xml b/tools/tabpfn/tabpfn.xml index 6687ee1225..f2c5794f44 100644 --- a/tools/tabpfn/tabpfn.xml +++ b/tools/tabpfn/tabpfn.xml @@ -27,6 +27,7 @@ + From c09e7acf7dc415e547741c9e27cbaab695dfc05a Mon Sep 17 00:00:00 2001 From: Anup Kumar Date: Tue, 22 Oct 2024 13:39:20 +0000 Subject: [PATCH 5/7] update test --- tools/tabpfn/main.py | 342 +------- tools/tabpfn/tabpfn.xml | 15 +- tools/tabpfn/test-data/local_test_rows | 838 +++++++++++++++++++ tools/tabpfn/test-data/local_train_rows | 838 +++++++++++++++++++ tools/tabpfn/test-data/output_predicted_data | 838 +++++++++++++++++++ 5 files changed, 2542 insertions(+), 329 deletions(-) create mode 100644 tools/tabpfn/test-data/local_test_rows create mode 100644 tools/tabpfn/test-data/local_train_rows create mode 100644 tools/tabpfn/test-data/output_predicted_data diff --git a/tools/tabpfn/main.py b/tools/tabpfn/main.py index ab4b081cc1..22ea6ac739 100644 --- a/tools/tabpfn/main.py +++ b/tools/tabpfn/main.py @@ -5,336 +5,20 @@ import argparse import time -from sklearn.metrics import accuracy_score import matplotlib.pyplot as plt -from functools import reduce - import numpy as np import pandas as pd import torch -from sklearn.preprocessing import LabelEncoder -from sklearn.utils._optional_dependencies import check_matplotlib_support -from sklearn.utils import _safe_indexing -from sklearn.base import is_regressor -from sklearn.utils.validation import check_is_fitted -import numpy as np - -import matplotlib.pyplot as plt -from matplotlib.colors import ListedColormap +from sklearn.metrics import accuracy_score +from sklearn.metrics import precision_recall_curve, average_precision_score from tabpfn import TabPFNClassifier -class DecisionBoundaryDisplay: - """Decisions boundary visualization. - It is recommended to use - :func:`~sklearn.inspection.DecisionBoundaryDisplay.from_estimator` - to create a :class:`DecisionBoundaryDisplay`. All parameters are stored as - attributes. - Read more in the :ref:`User Guide `. - .. versionadded:: 1.1 - Parameters - ---------- - xx0 : ndarray of shape (grid_resolution, grid_resolution) - First output of :func:`meshgrid `. - xx1 : ndarray of shape (grid_resolution, grid_resolution) - Second output of :func:`meshgrid `. - response : ndarray of shape (grid_resolution, grid_resolution) - Values of the response function. - xlabel : str, default=None - Default label to place on x axis. - ylabel : str, default=None - Default label to place on y axis. - Attributes - ---------- - surface_ : matplotlib `QuadContourSet` or `QuadMesh` - If `plot_method` is 'contour' or 'contourf', `surface_` is a - :class:`QuadContourSet `. If - `plot_method is `pcolormesh`, `surface_` is a - :class:`QuadMesh `. - ax_ : matplotlib Axes - Axes with confusion matrix. - figure_ : matplotlib Figure - Figure containing the confusion matrix. - """ - - def __init__(self, *, xx0, xx1, response, xlabel=None, ylabel=None): - self.xx0 = xx0 - self.xx1 = xx1 - self.response = response - self.xlabel = xlabel - self.ylabel = ylabel - - def plot(self, plot_method="contourf", ax=None, xlabel=None, ylabel=None, **kwargs): - """Plot visualization. 
- Parameters - ---------- - plot_method : {'contourf', 'contour', 'pcolormesh'}, default='contourf' - Plotting method to call when plotting the response. Please refer - to the following matplotlib documentation for details: - :func:`contourf `, - :func:`contour `, - :func:`pcolomesh `. - ax : Matplotlib axes, default=None - Axes object to plot on. If `None`, a new figure and axes is - created. - xlabel : str, default=None - Overwrite the x-axis label. - ylabel : str, default=None - Overwrite the y-axis label. - **kwargs : dict - Additional keyword arguments to be passed to the `plot_method`. - Returns - ------- - display: :class:`~sklearn.inspection.DecisionBoundaryDisplay` - """ - check_matplotlib_support("DecisionBoundaryDisplay.plot") - import matplotlib.pyplot as plt # noqa - - if plot_method not in ("contourf", "contour", "pcolormesh"): - raise ValueError( - "plot_method must be 'contourf', 'contour', or 'pcolormesh'" - ) - - if ax is None: - _, ax = plt.subplots() - - plot_func = getattr(ax, plot_method) - self.surface_ = plot_func(self.xx0, self.xx1, self.response, **kwargs) - - if xlabel is not None or not ax.get_xlabel(): - xlabel = self.xlabel if xlabel is None else xlabel - ax.set_xlabel(xlabel) - if ylabel is not None or not ax.get_ylabel(): - ylabel = self.ylabel if ylabel is None else ylabel - ax.set_ylabel(ylabel) - - self.ax_ = ax - self.figure_ = ax.figure - plt.savefig("output_decision_boundary.png") - return self - - @classmethod - def from_estimator( - cls, - estimator, - X, - *, - grid_resolution=100, - eps=1.0, - plot_method="contourf", - response_method="auto", - xlabel=None, - ylabel=None, - ax=None, - **kwargs, - ): - """Plot decision boundary given an estimator. - Read more in the :ref:`User Guide `. - Parameters - ---------- - estimator : object - Trained estimator used to plot the decision boundary. - X : {array-like, sparse matrix, dataframe} of shape (n_samples, 2) - Input data that should be only 2-dimensional. - grid_resolution : int, default=100 - Number of grid points to use for plotting decision boundary. - Higher values will make the plot look nicer but be slower to - render. - eps : float, default=1.0 - Extends the minimum and maximum values of X for evaluating the - response function. - plot_method : {'contourf', 'contour', 'pcolormesh'}, default='contourf' - Plotting method to call when plotting the response. Please refer - to the following matplotlib documentation for details: - :func:`contourf `, - :func:`contour `, - :func:`pcolomesh `. - response_method : {'auto', 'predict_proba', 'decision_function', \ - 'predict'}, default='auto' - Specifies whether to use :term:`predict_proba`, - :term:`decision_function`, :term:`predict` as the target response. - If set to 'auto', the response method is tried in the following order: - :term:`decision_function`, :term:`predict_proba`, :term:`predict`. - For multiclass problems, :term:`predict` is selected when - `response_method="auto"`. - xlabel : str, default=None - The label used for the x-axis. If `None`, an attempt is made to - extract a label from `X` if it is a dataframe, otherwise an empty - string is used. - ylabel : str, default=None - The label used for the y-axis. If `None`, an attempt is made to - extract a label from `X` if it is a dataframe, otherwise an empty - string is used. - ax : Matplotlib axes, default=None - Axes object to plot on. If `None`, a new figure and axes is - created. - **kwargs : dict - Additional keyword arguments to be passed to the - `plot_method`. 
- Returns - ------- - display : :class:`~sklearn.inspection.DecisionBoundaryDisplay` - Object that stores the result. - See Also - -------- - DecisionBoundaryDisplay : Decision boundary visualization. - ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix - given an estimator, the data, and the label. - ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix - given the true and predicted labels. - Examples - -------- - >>> import matplotlib.pyplot as plt - >>> from sklearn.datasets import load_iris - >>> from sklearn.linear_model import LogisticRegression - >>> from sklearn.inspection import DecisionBoundaryDisplay - >>> iris = load_iris() - >>> X = iris.data[:, :2] - >>> classifier = LogisticRegression().fit(X, iris.target) - >>> disp = DecisionBoundaryDisplay.from_estimator( - ... classifier, X, response_method="predict", - ... xlabel=iris.feature_names[0], ylabel=iris.feature_names[1], - ... alpha=0.5, - ... ) - >>> disp.ax_.scatter(X[:, 0], X[:, 1], c=iris.target, edgecolor="k") - <...> - >>> plt.show() - """ - check_matplotlib_support(f"{cls.__name__}.from_estimator") - check_is_fitted(estimator) - - if not grid_resolution > 1: - raise ValueError( - "grid_resolution must be greater than 1. Got" - f" {grid_resolution} instead." - ) - - if not eps >= 0: - raise ValueError( - f"eps must be greater than or equal to 0. Got {eps} instead." - ) - - possible_plot_methods = ("contourf", "contour", "pcolormesh") - if plot_method not in possible_plot_methods: - available_methods = ", ".join(possible_plot_methods) - raise ValueError( - f"plot_method must be one of {available_methods}. " - f"Got {plot_method} instead." - ) - - x0, x1 = _safe_indexing(X, 0, axis=1), _safe_indexing(X, 1, axis=1) - - x0_min, x0_max = x0.min() - eps, x0.max() + eps - x1_min, x1_max = x1.min() - eps, x1.max() + eps - - xx0, xx1 = np.meshgrid( - np.linspace(x0_min, x0_max, grid_resolution), - np.linspace(x1_min, x1_max, grid_resolution), - ) - if hasattr(X, "iloc"): - # we need to preserve the feature names and therefore get an empty dataframe - X_grid = X.iloc[[], :].copy() - X_grid.iloc[:, 0] = xx0.ravel() - X_grid.iloc[:, 1] = xx1.ravel() - else: - X_grid = np.c_[xx0.ravel(), xx1.ravel()] - - pred_func = _check_boundary_response_method(estimator, response_method) - response = pred_func(X_grid) - - # convert classes predictions into integers - if pred_func.__name__ == "predict" and hasattr(estimator, "classes_"): - encoder = LabelEncoder() - encoder.classes_ = estimator.classes_ - response = encoder.transform(response) - - if response.ndim != 1: - if is_regressor(estimator): - raise ValueError("Multi-output regressors are not supported") - - # TODO: Support pos_label - response = response[:, 1] - - if xlabel is None: - xlabel = X.columns[0] if hasattr(X, "columns") else "" - - if ylabel is None: - ylabel = X.columns[1] if hasattr(X, "columns") else "" - - display = DecisionBoundaryDisplay( - xx0=xx0, - xx1=xx1, - response=response.reshape(xx0.shape), - xlabel=xlabel, - ylabel=ylabel, - ) - return display.plot(ax=ax, plot_method=plot_method, **kwargs) - - -def make_decision_boundaries(trained_model, tr_features, tr_labels): - # PLOTTING - fig = plt.figure(figsize=(10,10)) - ax = fig.add_subplot(111) - cm = plt.cm.RdBu - cm_bright = ListedColormap(["purple", "yellow"]) - trained_model.fit(tr_features[:, 0:2], tr_labels) - - DecisionBoundaryDisplay.from_estimator( - trained_model, tr_features[:, 0:2], alpha=0.6, ax=ax, eps=2.0, grid_resolution=25, response_method="predict_proba" - ) - 
ax.scatter(tr_features[:, 0], tr_features[:, 1], c=tr_labels > 0, cmap=cm_bright) - - -def _check_boundary_response_method(estimator, response_method): - """Return prediction method from the `response_method` for decision boundary. - Parameters - ---------- - estimator : object - Fitted estimator to check. - response_method : {'auto', 'predict_proba', 'decision_function', 'predict'} - Specifies whether to use :term:`predict_proba`, - :term:`decision_function`, :term:`predict` as the target response. - If set to 'auto', the response method is tried in the following order: - :term:`decision_function`, :term:`predict_proba`, :term:`predict`. - Returns - ------- - prediction_method: callable - Prediction method of estimator. - """ - has_classes = hasattr(estimator, "classes_") - - if has_classes and len(estimator.classes_) > 2: - if response_method not in {"auto", "predict"}: - msg = ( - "Multiclass classifiers are only supported when response_method is" - " 'predict' or 'auto'" - ) - raise ValueError(msg) - methods_list = ["predict"] - elif response_method == "auto": - methods_list = ["decision_function", "predict_proba", "predict"] - else: - methods_list = [response_method] - - prediction_method = [getattr(estimator, method, None) for method in methods_list] - prediction_method = reduce(lambda x, y: x or y, prediction_method) - if prediction_method is None: - raise ValueError( - f"{estimator.__class__.__name__} has none of the following attributes: " - f"{', '.join(methods_list)}." - ) - - return prediction_method - - def separate_features_labels(data): df = pd.read_csv(data, sep=",") labels = df.iloc[:, -1] features = df.iloc[:, :-1] - print(df) - print(features) - print(labels) return features, labels @@ -342,11 +26,8 @@ def train_evaluate(args): """ Train TabPFN """ - print(args) - tr_features, tr_labels = separate_features_labels(args["train_data"]) te_features, te_labels = separate_features_labels(args["test_data"]) - classifier = TabPFNClassifier(device='cpu', N_ensemble_configurations=32) s_time = time.time() classifier.fit(tr_features, tr_labels) @@ -354,18 +35,25 @@ def train_evaluate(args): print("Time taken by TabPFN for training: {} seconds".format(e_time - s_time)) y_eval, p_eval = classifier.predict(te_features, return_winning_probability=True) print('Accuracy', accuracy_score(te_labels, y_eval)) - - te_features["true_labels"] = te_labels - te_features["pred_labels"] = y_eval - te_features.to_csv("output_predicted_data.csv", sep="\t", index=None) - make_decision_boundaries(classifier, tr_features, tr_labels) + pred_probas_test = classifier.predict_proba(te_features) + te_features["predicted_labels"] = y_eval + te_features.to_csv("output_predicted_data", sep="\t", index=None) + precision, recall, thresholds = precision_recall_curve(te_labels, pred_probas_test[:, 1]) + average_precision = average_precision_score(te_labels, pred_probas_test[:, 1]) + plt.figure(figsize=(8, 6)) + plt.plot(recall, precision, label=f'Precision-Recall Curve (AP={average_precision:.2f})') + plt.xlabel('Recall') + plt.ylabel('Precision') + plt.title('Precision-Recall Curve') + plt.legend(loc='lower left') + plt.grid(True) + plt.savefig("output_prec_recall_curve.png") if __name__ == "__main__": arg_parser = argparse.ArgumentParser() arg_parser.add_argument("-trdata", "--train_data", required=True, help="Train data") arg_parser.add_argument("-tedata", "--test_data", required=True, help="Test data") - # get argument values args = vars(arg_parser.parse_args()) train_evaluate(args) diff --git 
a/tools/tabpfn/tabpfn.xml b/tools/tabpfn/tabpfn.xml
index f2c5794f44..8a549e55cc 100644
--- a/tools/tabpfn/tabpfn.xml
+++ b/tools/tabpfn/tabpfn.xml
@@ -12,6 +12,7 @@
     tabpfn
     pandas
+    matplotlib
     echo "@VERSION@"
@@ -26,10 +27,20 @@
[XML tags stripped in extraction; among the surviving +/- markers this hunk removes 2 lines and adds about 12, consistent with PATCH 5 renaming the predicted-data output, adding the precision-recall-curve PNG output, and rewriting the tool test against the new test-data files.]

From [commit id lost in extraction] Mon Sep 17 00:00:00 2001
From: Anup Kumar
Date: Tue, 22 Oct 2024 14:18:30 +0000
Subject: [PATCH 6/7] add .shed.yml and remove unused imports

---
 tools/tabpfn/.shed.yml | 13 +++++++++++++
 tools/tabpfn/main.py   |  3 ---
 2 files changed, 13 insertions(+), 3 deletions(-)
 create mode 100644 tools/tabpfn/.shed.yml

diff --git a/tools/tabpfn/.shed.yml b/tools/tabpfn/.shed.yml
new file mode 100644
index 0000000000..19df40090b
--- /dev/null
+++ b/tools/tabpfn/.shed.yml
@@ -0,0 +1,13 @@
+name: tabpfn
+owner: bgruening
+description: Tabular data prediction with TabPFN, using PyTorch.
+long_description: |
+  TabPFN is a neural network that learned to do tabular data prediction.
+  This is the original CUDA-supporting PyTorch implementation.
+remote_repository_url: https://github.com/bgruening/galaxytools/tree/master/tools/tabpfn
+homepage_url: https://github.com/bgruening/galaxytools/tree/master/tools/tabpfn
+type:
+categories:
+  - Machine Learning
+maintainers:
+  - anuprulez
diff --git a/tools/tabpfn/main.py b/tools/tabpfn/main.py
index 22ea6ac739..18e65d4c28 100644
--- a/tools/tabpfn/main.py
+++ b/tools/tabpfn/main.py
@@ -1,14 +1,11 @@
 """
 Tabular data prediction using TabPFN
 """
-
 import argparse
 import time
 
 import matplotlib.pyplot as plt
-import numpy as np
 import pandas as pd
-import torch
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import precision_recall_curve, average_precision_score

From df932aec08f83224d3fc21c089f60f61ef1a0eb4 Mon Sep 17 00:00:00 2001
From: Anup Kumar
Date: Tue, 22 Oct 2024 14:35:58 +0000
Subject: [PATCH 7/7] fix import order

---
 tools/tabpfn/main.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/tabpfn/main.py b/tools/tabpfn/main.py
index 18e65d4c28..13c2ddbedb 100644
--- a/tools/tabpfn/main.py
+++ b/tools/tabpfn/main.py
@@ -6,9 +6,7 @@ import argparse
 import time
 
 import matplotlib.pyplot as plt
 import pandas as pd
-
-from sklearn.metrics import accuracy_score
-from sklearn.metrics import precision_recall_curve, average_precision_score
+from sklearn.metrics import accuracy_score, average_precision_score, precision_recall_curve
 from tabpfn import TabPFNClassifier
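
A quick local sanity check of the final tools/tabpfn/main.py (the state after PATCH 7) is sketched below. It is not part of the patch series, and the file names, the 50/50 split, and the make_classification settings are illustrative assumptions; the only facts it reuses from the patches are the comma-separated, label-in-last-column layout that separate_features_labels() parses and the --train_data/--test_data flags added in PATCH 3.

# smoke_test.py -- hedged local check for tools/tabpfn/main.py (not part of the patches)
# Assumes it runs from tools/tabpfn/ with tabpfn, pandas, scikit-learn and
# matplotlib installed; the first run may download the pretrained TabPFN weights.
import subprocess

import pandas as pd
from sklearn.datasets import make_classification

# Synthetic binary-classification table: features first, class label last,
# matching the layout separate_features_labels() expects.
X, y = make_classification(n_samples=200, n_features=8, random_state=0)
df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
df["label"] = y

# Arbitrary 50/50 split into the two inputs the script requires.
df.iloc[:100].to_csv("train_rows.csv", index=False)
df.iloc[100:].to_csv("test_rows.csv", index=False)

# Invoke the script the same way the Galaxy wrapper's command line would.
subprocess.run(
    ["python", "main.py", "--train_data", "train_rows.csv", "--test_data", "test_rows.csv"],
    check=True,
)
# On success the script writes output_predicted_data (tab-separated predictions
# with a predicted_labels column) and output_prec_recall_curve.png.

Note that train_evaluate() draws the precision-recall curve from pred_probas_test[:, 1], so the PNG output is only meaningful for binary labels; with more than two classes, precision_recall_curve() will raise after the prediction table has already been written.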