From cc1ff21e105846681a5c45a558db084387e7dac2 Mon Sep 17 00:00:00 2001 From: Chirag Nagpal Date: Tue, 1 Mar 2022 20:25:31 -0500 Subject: [PATCH] modified: phenotyping.py --- auton_survival/phenotyping.py | 299 ++++++++++++++++++---------------- 1 file changed, 162 insertions(+), 137 deletions(-) diff --git a/auton_survival/phenotyping.py b/auton_survival/phenotyping.py index 97bc433..5544528 100644 --- a/auton_survival/phenotyping.py +++ b/auton_survival/phenotyping.py @@ -21,7 +21,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -"""Tools to identify subgroups for use in comparing survival probabilities among groups.""" +"""Utilities to phenotype individuals based on similar survival +characteristics.""" import numpy as np import pandas as pd @@ -43,9 +44,10 @@ def __init__(self, random_seed=0): self.fitted = False class IntersectionalPhenotyper(Phenotyper): - """A phenotyper that phenotypes by performing an exhaustive cartesian product on specified categorical - and numerical variables. - + + """A phenotyper that phenotypes by performing an exhaustive cartesian + product on prespecified set of categorical and numerical variables. + Parameters ----------- cat_vars : list of python str(s), default=None @@ -53,18 +55,20 @@ class IntersectionalPhenotyper(Phenotyper): num_vars : list of python str(s), default=None List of column names of continuous variables to phenotype on. num_vars_quantiles : tuple of floats, default=(0, .5, 1.0) - A tuple of quantiles as floats (inclusive of 0 and 1) used to discretize continuous variables. - into equal-sized bins. + A tuple of quantiles as floats (inclusive of 0 and 1) used to + discretize continuous variables into equal-sized bins. features : pd.DataFrame - A pandas dataframe with rows corresponding to individual samples and columns as covariates. + A pandas dataframe with rows corresponding to individual + samples and columns as covariates. 
phenotypes : list - List of lists containing all possible combinations of specified categorical and numerical variable values. - + List of lists containing all possible combinations of specified + categorical and numerical variable values. + """ def __init__(self, cat_vars=None, num_vars=None, num_vars_quantiles=(0, .5, 1.0)): - + if isinstance(cat_vars, str): cat_vars = [cat_vars] if isinstance(num_vars, str): num_vars = [num_vars] @@ -80,18 +84,21 @@ def __init__(self, cat_vars=None, num_vars=None, self.fitted = False def fit(self, features): - """Fit the phenotyper by finding all possible intersectional groups on the passed dataset. - - Parameters - ----------- - features : pd.DataFrame - A pandas dataframe with rows corresponding to individual samples and columns as covariates. - - Returns - ----------- - Trained instance of intersectional phenotyper. - - """ + + """Fit the phenotyper by finding all possible intersectional groups + on a passed set of features. + + Parameters + ----------- + features : pd.DataFrame + A pandas dataframe with rows corresponding to individual samples + and columns as covariates. + + Returns + ----------- + Trained instance of intersectional phenotyper. + + """ self.cut_bins = {} self.min_max = {} @@ -110,28 +117,31 @@ def fit(self, features): return self def phenotype(self, features): - """Generate phenotypes on a dataset based on groups learnt when fitting. - - Parameters - ----------- - features : pd.DataFrame - A pandas dataframe with rows corresponding to individual samples and columns as covariates. - - Returns - ----------- - np.array : A numpy array containing a list of strings that define subgroups from all possible combinations - of specified categorical and numerical variables. - - """ + + """Phenotype out of sample test data. + + Parameters + ----------- + features : pd.DataFrame + A pandas dataframe with rows corresponding to individual samples + and columns as covariates. 
+ + Returns + -------- + np.array : A numpy array containing a list of strings that define + subgroups from all possible combinations of specified categorical + and numerical variables. + + """ assert self.fitted, "Phenotyper must be `fitted` before calling `phenotype`." features = deepcopy(features) for num_var in self.num_vars: - + var_min, var_max = self.min_max[num_var] - features[num_var][features[num_var]>=var_max] = var_max + features[num_var][features[num_var]>=var_max] = var_max features[num_var][features[num_var]<=var_min] = var_min features[num_var] = pd.cut(features[num_var], self.cut_bins[num_var], @@ -145,78 +155,85 @@ def phenotype(self, features): return phenotypes def _rename(self, phenotypes): - """Helper function to rename the phenotype names. - - Parameters - ----------- - phenotypes : list - List of lists containing all possible combinations of specified categorical and numerical variable values. - - Returns - ----------- - list : python list of a list of strings that define subgroups. - - """ + + """Helper function to clean the phenotype names. + + Parameters + ----------- + phenotypes : list + List of lists containing all possible combinations of specified + categorical and numerical variable values. + Returns + -------- + list : python list of a list of strings that define subgroups. + + """ ft_names = self.cat_vars + self.num_vars renamed = [] - for i in range(len(demographics)): + for i in range(len(phenotypes)): row = [] for j in range(len(ft_names)): - row.append(ft_names[j]+":"+str(demographics[i][j])) + row.append(ft_names[j]+":"+str(phenotypes[i][j])) renamed.append(" & ".join(row)) return renamed def fit_phenotype(self, features): - """Fit and perform phenotyping on a given dataset. - - Parameters - ----------- - features : pd.DataFrame - A pandas dataframe with rows corresponding to individual samples and columns as covariates. 
- - Returns - ----------- - np.array : A numpy array containing a list of strings that define subgroups from all possible combinations of specified - categorical and numerical variables. - - """ + + """Fit and perform phenotyping on a given dataset. + + Parameters + ----------- + features : pd.DataFrame + A pandas dataframe with rows corresponding to individual samples + and columns as covariates. + + Returns + ----------- + np.array : A numpy array containing a list of strings that define + subgroups from all possible combinations of specified categorical + and numerical variables. + + """ return self.fit(features).phenotype(features) class ClusteringPhenotyper(Phenotyper): - """Phenotyper that performs dimensionality reduction followed by clustering. Learned clusters are considered phenotypes - and used to group samples based on similarity in the covariate space. + + """Phenotyper that performs dimensionality reduction followed by clustering. + Learned clusters are considered phenotypes and used to group samples based + on similarity in the covariate space. Parameters ----------- features : pd.DataFrame - A pandas dataframe with rows corresponding to individual samples and columns as covariates. + A pandas dataframe with rows corresponding to individual samples + and columns as covariates. clustering_method : str, default='kmeans' The clustering method applied for phenotyping. Options include: - 'kmeans' : K-Means Clustering - 'dbscan' : Density-Based Spatial Clustering of Applications with Noise (DBSCAN) - 'gmm' : Gaussian Mixture - - 'hierarchical' : Agglomerative Clustering + - 'hierarchical' : Agglomerative Clustering dim_red_method : str, default=None The dimensionality reductions method applied. Options include: - 'pca' : Principal Component Analysis - 'kpca' : Kernel Principal Component Analysis - - 'nnmf' : Non-Negative Matrix Factorization - - None : dimensionality reduction is not applied. 
+ - 'nnmf' : Non-Negative Matrix Factorization + - None : dimensionality reduction is not applied. random_seed : int, default=0 - Controls the randomness and reproducibility of called functions + Controls the randomness and reproducibility of called functions kwargs : dict Additional arguments for dimensionality reduction and clustering Please include dictionary key and item pairs specified by the following sci-kit learn modules: 'pca' : sklearn.decomposition.PCA 'nnmf' : sklearn.decomposition.NMF - 'kpca' : sklearn.decomposition.KernelPCA + 'kpca' : sklearn.decomposition.KernelPCA 'kmeans' : sklearn.cluster.KMeans 'dbscan' : sklearn.cluster.DBSCAN 'gmm' : sklearn.mixture.GaussianMixture 'hierarchical' : sklearn.cluster.AgglomerativeClustering - + """ _VALID_DIMRED_METHODS = ['pca', 'kpca', 'nnmf', None] @@ -229,7 +246,8 @@ def __init__(self, clustering_method = 'kmeans', dim_red_method = None, random_s # Raise warning if "hierarchical" is used with dim_redcution if (clustering_method in ['hierarchical']) and (dim_red_method is not None): - print("WARNING: Are you sure you want to run hierarchical clustering on decomposed features?. 
Such behaviour is atypical.") + print("WARNING: Are you sure you want to run hierarchical clustering on decomposed features?.", + "Such behaviour is atypical.") # Dimensionality Reduction Step: if dim_red_method is not None: @@ -238,13 +256,13 @@ def __init__(self, clustering_method = 'kmeans', dim_red_method = None, random_s elif dim_red_method == 'nnmf': dim_red_model = decomposition.NMF elif 'kpca' in dim_red_method: - dim_red_model = decomposition.KernelPCA + dim_red_model = decomposition.KernelPCA else: raise NotImplementedError("Dimensionality Reduction method: "+dim_red_method+ " Not Implemented.") if clustering_method == 'kmeans': - clustering_model= cluster.KMeans - elif clustering_method == 'dbscan': + clustering_model= cluster.KMeans + elif clustering_method == 'dbscan': clustering_model = cluster.DBSCAN elif clustering_method == 'gmm': clustering_model = mixture.GaussianMixture @@ -253,14 +271,14 @@ def __init__(self, clustering_method = 'kmeans', dim_red_method = None, random_s else: raise NotImplementedError("Clustering method: "+clustering_method+ " Not Implemented.") - self.clustering_method = clustering_method + self.clustering_method = clustering_method self.dim_red_method = dim_red_method c_kwargs = _get_method_kwargs(clustering_model, kwargs) - if clustering_method == 'gmm': + if clustering_method == 'gmm': if 'covariance_type' not in c_kwargs: c_kwargs['covariance_type'] = 'diag' - c_kwargs['n_components'] = c_kwargs.get('n_clusters', 3) + c_kwargs['n_components'] = c_kwargs.get('n_clusters', 3) self.clustering_model = clustering_model(**c_kwargs) if dim_red_method is not None: @@ -274,19 +292,22 @@ def __init__(self, clustering_method = 'kmeans', dim_red_method = None, random_s self.dim_red_model = dim_red_model(**d_kwargs) def fit(self, features): - """Perform dimensionality reduction and train an instance of the clustering algorithm. 
- - Parameters - ----------- - features : pd.DataFrame - A pandas dataframe with rows corresponding to individual samples and columns as covariates. - - Returns - ----------- - Trained instance of clustering phenotyper. - - """ - + + """Perform dimensionality reduction and train an instance + of the clustering algorithm. + + Parameters + ----------- + features : pd.DataFrame + A pandas dataframe with rows corresponding to individual + samples and columns as covariates. + + Returns + ----------- + Trained instance of clustering phenotyper. + + """ + if self.dim_red_method is not None: print("Fitting the following Dimensionality Reduction Model:\n", self.dim_red_model) self.dim_red_model = self.dim_red_model.fit(features) @@ -302,19 +323,23 @@ def fit(self, features): return self def _predict_proba_kmeans(self, features): - """Estimate the probability of belonging to a cluster by computing the distance to cluster center normalized - by the sum of distances to other clusters. - - Parameters - ----------- - features : pd.DataFrame - A pandas dataframe with rows corresponding to individual samples and columns as covariates. - - Returns - ----------- - np.array : A numpy array of probability estimates of sample association to learned subgroups. - - """ + + """Estimate the probability of belonging to a cluster by computing + the distance to cluster center normalized by the sum of distances + to other clusters. + + Parameters + ----------- + features : pd.DataFrame + A pandas dataframe with rows corresponding to individual samples + and columns as covariates. + + Returns + ----------- + np.array : A numpy array of probability estimates of sample association + to learned subgroups. + + """ #TODO:MAYBE DO THIS IN LOG SPACE? 
@@ -326,19 +351,23 @@ def _predict_proba_kmeans(self, features): return probs def phenotype(self, features): - """Peform dimensionality reduction, clustering, and create phenotypes based on the probability estimates - of sample association to learned clusters, or subgroups. - - Parameters - ----------- - features : pd.DataFrame - A pandas dataframe with rows corresponding to individual samples and columns as covariates. - - Returns - ----------- - np.array : A numpy array of the probability estimates of sample association to learned subgroups. - - """ + + """Perform dimensionality reduction, clustering, and create phenotypes + based on the probability estimates of sample association to learned + clusters, or subgroups. + + Parameters + ----------- + features : pd.DataFrame + A pandas dataframe with rows corresponding to individual samples + and columns as covariates. + + Returns + ----------- + np.array : A numpy array of the probability estimates of sample + association to learned subgroups. + + """ assert self.fitted, "Phenotyper must be `fitted` before calling `phenotype`." @@ -350,25 +379,21 @@ def phenotype(self, features): return self._predict_proba_kmeans(features) def fit_phenotype(self, features): - """Fit and perform phenotyping on a given dataset. - - Parameters - ----------- - features : pd.DataFrame - A pandas dataframe with rows corresponding to individual samples and columns as covariates. - - Returns - ----------- - np.array : A numpy array of the probability estimates of sample association to learned clusters. - - """ - - return self.fit(features).phenotype(features) + + """Fit and perform phenotyping on a given dataset. -class CoxMixturePhenotyper(Phenotyper): -"""Not implemented.""" + Parameters + ----------- + features : pd.DataFrame + A pandas dataframe with rows corresponding to individual samples + and columns as covariates. 
+ + Returns + ----------- + np.array : A numpy array of the probability estimates of sample + association to learned clusters. - def __init__(self): + """ - raise NotImplementedError() + return self.fit(features).phenotype(features)