Skip to content

Commit

Permalink
Merge pull request #24 from KrishnaswamyLab/dev
Browse files Browse the repository at this point in the history
Upgrade to version 0.2.4: accepts affinity and distance matrices
  • Loading branch information
scottgigante authored May 14, 2018
2 parents 9804d9d + d9eebe0 commit ffb3c24
Show file tree
Hide file tree
Showing 6 changed files with 224 additions and 170 deletions.
115 changes: 80 additions & 35 deletions Python/phate/phate.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ def calculate_kernel(data, k=15, a=10, alpha_decay=True, knn_dist='euclidean',
Parameters
----------
data : array-like [n_samples, n_dimensions]
2 dimensional input data array with n cells and p dimensions
2 dimensional input data array with n cells and p dimensions. If
`knn_dist` is 'precomputed', `data` should be an n_samples x n_samples
distance matrix
k : int, optional, default: 15
used to set epsilon while autotuning kernel bandwidth
Expand All @@ -51,9 +53,10 @@ def calculate_kernel(data, k=15, a=10, alpha_decay=True, knn_dist='euclidean',
If true, use the alpha decaying kernel
knn_dist : string, optional, default: 'euclidean'
recommended values: 'euclidean' and 'cosine'
Any metric from scipy.spatial.distance can be used
distance metric for building kNN graph
recommended values: 'euclidean', 'cosine', 'precomputed'
Any metric from `scipy.spatial.distance` can be used
distance metric for building kNN graph. If 'precomputed',
`data` should be an n_samples x n_samples distance matrix
verbose : boolean, optional, default: True
If true, print status messages
Expand All @@ -79,9 +82,7 @@ def calculate_kernel(data, k=15, a=10, alpha_decay=True, knn_dist='euclidean',
kernel : array-like [n_samples, n_samples]
kernel matrix built from the input data
"""
precomputed = isinstance(knn_dist, list) or \
isinstance(knn_dist, np.ndarray)
if not precomputed and ndim < data.shape[1]:
if knn_dist != 'precomputed' and ndim < data.shape[1]:
if verbose:
print("Calculating PCA...")
start = time.time()
Expand All @@ -102,10 +103,10 @@ def calculate_kernel(data, k=15, a=10, alpha_decay=True, knn_dist='euclidean',
# kernel includes self as connection but not in k
# actually search for k+1 neighbors including self
k = k + 1
if alpha_decay:
if alpha_decay and a is not None:
try:
if precomputed:
pdx = knn_dist
if knn_dist == 'precomputed':
pdx = data
else:
pdx = squareform(pdist(data, metric=knn_dist))
knn_dist = np.partition(pdx, k, axis=1)[:, :k]
Expand All @@ -118,7 +119,8 @@ def calculate_kernel(data, k=15, a=10, alpha_decay=True, knn_dist='euclidean',
'Try removing duplicates.')
kernel = np.exp(-1 * (pdx ** a)) # not really Gaussian kernel
else:
if precomputed:
if knn_dist == 'precomputed':
# we already have pairwise distances
pdx = knn_dist
knn_idx = np.argpartition(pdx, k, axis=1)[:, :k]
ind_ptr = np.arange(knn_idx.shape[0] + 1) * knn_idx.shape[1]
Expand Down Expand Up @@ -184,7 +186,8 @@ def calculate_landmark_operator(kernel, n_landmark=2000,
n_components=n_svd,
random_state=random_state)
if verbose:
print("SVD complete in {:.2f} seconds".format(time.time() - start))
print("Calculated SVD in {:.2f} seconds".format(
time.time() - start))
start = time.time()
print("Calculating Kmeans...")
kmeans = MiniBatchKMeans(n_landmark,
Expand All @@ -194,7 +197,8 @@ def calculate_landmark_operator(kernel, n_landmark=2000,
clusters = kmeans.fit_predict(np.matmul(U, np.diagflat(S)))
landmarks = np.unique(clusters)
if verbose:
print("Complete in ", time.time() - start)
print("Calculated Kmeans in {:.2f} seconds".format(
time.time() - start))

# transition matrices
if is_sparse:
Expand Down Expand Up @@ -226,7 +230,9 @@ def calculate_operator(data, k=15, a=10, alpha_decay=True, n_landmark=2000,
Parameters
----------
data : array-like [n_samples, n_dimensions]
2 dimensional input data array with n cells and p dimensions
2 dimensional input data array with n cells and p dimensions. If
`knn_dist` is 'precomputed', `data` should be an n_samples x n_samples
distance or affinity matrix
k : int, optional, default: 15
used to set epsilon while autotuning kernel bandwidth
Expand All @@ -241,9 +247,11 @@ def calculate_operator(data, k=15, a=10, alpha_decay=True, n_landmark=2000,
number of landmarks to use in fast PHATE
knn_dist : string, optional, default: 'euclidean'
recommended values: 'euclidean' and 'cosine'
Any metric from scipy.spatial.distance can be used
distance metric for building kNN graph
recommended values: 'euclidean', 'cosine', 'precomputed'
Any metric from `scipy.spatial.distance` can be used
distance metric for building kNN graph. If 'precomputed',
`data` should be an n_samples x n_samples distance or
affinity matrix
diff_op : array-like, optional shape=[n_samples, n_samples], default: None
Precomputed diffusion operator
Expand Down Expand Up @@ -296,12 +304,26 @@ def calculate_operator(data, k=15, a=10, alpha_decay=True, n_landmark=2000,
if diff_op is None:
if verbose:
print("Building kNN graph and diffusion operator...")
kernel = calculate_kernel(data, a=a, k=k, knn_dist=knn_dist,
ndim=n_pca,
alpha_decay=alpha_decay,
random_state=random_state,
n_jobs=n_jobs,
verbose=verbose)
if knn_dist == 'precomputed' and np.all(np.diagonal(data) != 0):
print("Using precomputed affinity matrix...")
kernel = data
else:
if knn_dist == 'precomputed':
if np.all(np.diagonal(data) == 0):
print("Using precomputed distance matrix...")
else:
raise ValueError(
"Cannot determine precomputed data type. "
"Precomputed affinity matrices should have "
"only non-zero entries on the diagonal, and"
" precomputed distance matrices should have"
" only zero entries on the diagonal.")
kernel = calculate_kernel(data, a=a, k=k, knn_dist=knn_dist,
ndim=n_pca,
alpha_decay=alpha_decay,
random_state=random_state,
n_jobs=n_jobs,
verbose=verbose)
diff_op, landmark_transitions = calculate_landmark_operator(
kernel, n_landmark=n_landmark,
random_state=random_state, verbose=verbose)
Expand Down Expand Up @@ -382,17 +404,32 @@ def embed_mds(diff_op, t=30, n_components=2, diff_potential=None,

X = np.linalg.matrix_power(diff_op, t) # diffused diffusion operator

if potential_method == 'log':
if potential_method == 'log': # or potential_method == 1:
# handling small values
# X[X <= np.finfo(float).eps] = np.finfo(
# float).eps
X = X + 1e-3
X = X + 1e-7
diff_potential = -1 * np.log(X) # diffusion potential
elif potential_method == 'sqrt':
diff_potential = np.sqrt(X) # diffusion potential
else:
raise ValueError("Allowable 'potential_method' values: 'log' or "
"'sqrt'. '%s' was passed." % (potential_method))
else: # if isinstance(potential_method, str):
raise ValueError(
"Allowable 'potential_method' values: 'log' or "
"'sqrt'. '{}' was passed.".format(potential_method))
# else:
# # gamma
# print("Warning: gamma potential is not stable."
# " Recommended values: 'log' or 'sqrt'")
# if potential_method > 1 or potential_method < -1:
# raise ValueError(
# "Allowable 'potential_method' values between -1 and 1"
# " inclusive. '{}' was passed.".format(potential_method))
# elif potential_method != -1:
# diff_potential = 2 / (1 - potential_method) * \
# np.power(X, ((1 - potential_method) / 2))
# else:
# # gamma = -1 is just MDS on DM
# diff_potential = X

if verbose:
print("Calculated diffusion potential in "
Expand Down Expand Up @@ -425,9 +462,9 @@ class PHATE(BaseEstimator):
"""PHATE operator which performs dimensionality reduction.
Potential of Heat-diffusion for Affinity-based Trajectory Embedding
(PHATE).[1]_ Embeds high dimensional single-cell data into two or three
(PHATE) embeds high dimensional single-cell data into two or three
dimensions for visualization of biological progressions as described
in .
in Moon et al, 2017 [1]_.
Parameters
----------
Expand Down Expand Up @@ -466,9 +503,11 @@ class PHATE(BaseEstimator):
log(n_samples) time.
knn_dist : string, optional, default: 'euclidean'
recommended values: 'euclidean' and 'cosine'
recommended values: 'euclidean', 'cosine', 'precomputed'
Any metric from `scipy.spatial.distance` can be used
distance metric for building kNN graph
distance metric for building kNN graph. If 'precomputed',
`data` should be an n_samples x n_samples distance or
affinity matrix
mds_dist : string, optional, default: 'euclidean'
recommended values: 'euclidean' and 'cosine'
Expand Down Expand Up @@ -628,7 +667,9 @@ def fit(self, X):
X : array, shape=[n_samples, n_features]
input data with `n_samples` samples and `n_dimensions`
dimensions. Accepted data types: `numpy.ndarray`,
`scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`
`scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
`knn_dist` is 'precomputed', `X` should be an n_samples x
n_samples distance or affinity matrix
Returns
-------
Expand Down Expand Up @@ -673,7 +714,9 @@ def transform(self, X=None, t_max=100, plot_optimal_t=False, ax=None):
dimensions. Not required, since PHATE does not currently embed
cells not given in the input matrix to `PHATE.fit()`.
Accepted data types: `numpy.ndarray`,
`scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.
`scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
`knn_dist` is 'precomputed', `X` should be an n_samples x
n_samples distance or affinity matrix
t_max : int, optional, default: 100
maximum t to test if `t` is set to 'auto'
Expand Down Expand Up @@ -733,7 +776,9 @@ def fit_transform(self, X, **kwargs):
X : array, shape=[n_samples, n_features]
input data with `n_samples` samples and `n_dimensions`
dimensions. Accepted data types: `numpy.ndarray`,
`scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`
`scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
`knn_dist` is 'precomputed', `X` should be an n_samples x
n_samples distance or affinity matrix
kwargs : further arguments for `PHATE.transform()`
Keyword arguments as specified in :func:`~phate.PHATE.transform`
Expand Down
2 changes: 1 addition & 1 deletion Python/phate/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.3"
__version__ = "0.2.4"
Loading

0 comments on commit ffb3c24

Please sign in to comment.