Skip to content

Commit

Permalink
v0.3.7
Browse files Browse the repository at this point in the history
  • Loading branch information
niklases committed Nov 21, 2024
1 parent 6a6fc50 commit 2eb5682
Show file tree
Hide file tree
Showing 10 changed files with 111 additions and 27 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ datasets/ANEH/ML_Model_Performance_WEBA780101_ELASTICNET.png
datasets/ANEH/ML_Model_Performance_WEBA780101_PLS_LOOCV.png
datasets/ANEH/ML_Model_Performance_WEBA780101_PLS.png
datasets/ANEH/ML_Model_Performance_WEBA780101_RIDGE.png
datasets/ANEH/ML_Model_Performance_CHAM830104_PLS_LOOCV.png
datasets/ANEH/ML_Model_Performance_OOBM850104_PLS_LOOCV.png
datasets/ANEH/ml_pls_37_ANEH_variants_aaidx_encoded_train_concat_lvls_extrapolation.png
datasets/ANEH/ml_pls_37_ANEH_variants_aaidx_encoded_train_lvl_1_extrapolation.png
datasets/ANEH/ml_pls_37_ANEH_variants_gremlin_dca_encoded_train_concat_lvls_extrapolation.png
Expand Down Expand Up @@ -250,6 +252,11 @@ datasets/ANEH/CV_performance/WEBA780101_PLS_LOOCV_5-fold-CV.png
datasets/ANEH/CV_performance/WEBA780101_PLS_LOOCV_CV_Results.txt
datasets/ANEH/CV_performance/WEBA780101_RIDGE_5-fold-CV.png
datasets/ANEH/CV_performance/WEBA780101_RIDGE_CV_Results.txt
datasets/ANEH/CV_performance/CHAM830104_PLS_LOOCV_5-fold-CV.png
datasets/ANEH/CV_performance/CHAM830104_PLS_LOOCV_CV_Results.txt
datasets/ANEH/CV_performance/OOBM850104_PLS_LOOCV_5-fold-CV.png
datasets/ANEH/CV_performance/OOBM850104_PLS_LOOCV_CV_Results.txt
datasets/ANEH/Pickles/OOBM850104
datasets/ANEH/Diverse_Double_Split/Diverse_Double_Split1.fasta
datasets/ANEH/Diverse_Double_Split/Predictions_FAUJ880104_TopDiverse_Double_Split.txt
datasets/ANEH/Diverse_Double_Split/Predictions_MLgremlin_TopDiverse_Double_Split.txt
Expand Down
30 changes: 25 additions & 5 deletions gui/qt_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,21 @@ def __init__(
self.button_dca_inference_gremlin = QtWidgets.QPushButton("MSA optimization (GREMLIN)")
self.button_dca_inference_gremlin.setMinimumWidth(80)
self.button_dca_inference_gremlin.setToolTip(
"Generating DCA parameters using GREMLIN (\"MSA optimization\"), "
"you have to provide an MSA in FASTA or A2M format"
"Generating DCA parameters using GREMLIN (\"MSA optimization\"); "
"requires an MSA in FASTA or A2M format"
)
self.button_dca_inference_gremlin.clicked.connect(self.pypef_gremlin)
self.button_dca_inference_gremlin.setStyleSheet(button_style)

self.button_dca_inference_gremlin_msa_info = QtWidgets.QPushButton("GREMLIN SSM prediction")
self.button_dca_inference_gremlin_msa_info.setMinimumWidth(80)
self.button_dca_inference_gremlin_msa_info.setToolTip(
"Generating DCA parameters using GREMLIN (\"MSA optimization\") and save plots of "
"visualized results; requires an MSA in FASTA or A2M format"
)
self.button_dca_inference_gremlin_msa_info.clicked.connect(self.pypef_gremlin_msa_info)
self.button_dca_inference_gremlin_msa_info.setStyleSheet(button_style)

self.button_dca_test_dca = QtWidgets.QPushButton("Test (DCA)")
self.button_dca_test_dca.setMinimumWidth(80)
self.button_dca_test_dca.setToolTip(
Expand Down Expand Up @@ -208,8 +217,9 @@ def __init__(

layout.addWidget(self.dca_text, 3, 1, 1, 1)
layout.addWidget(self.button_dca_inference_gremlin, 4, 1, 1, 1)
layout.addWidget(self.button_dca_test_dca, 5, 1, 1, 1)
layout.addWidget(self.button_dca_predict_dca, 6, 1, 1, 1)
layout.addWidget(self.button_dca_inference_gremlin_msa_info, 5, 1, 1, 1)
layout.addWidget(self.button_dca_test_dca, 6, 1, 1, 1)
layout.addWidget(self.button_dca_predict_dca, 7, 1, 1, 1)

layout.addWidget(self.hybrid_text, 3, 2, 1, 1)
layout.addWidget(self.button_hybrid_train_dca, 4, 2, 1, 1)
Expand All @@ -222,7 +232,7 @@ def __init__(
layout.addWidget(self.button_supervised_train_test_dca, 4, 3, 1, 1)
layout.addWidget(self.button_supervised_train_test_onehot, 5, 3, 1, 1)

layout.addWidget(self.textedit_out, 7, 0, 1, -1)
layout.addWidget(self.textedit_out, 8, 0, 1, -1)

self.process = QtCore.QProcess(self)
self.process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
Expand All @@ -233,6 +243,8 @@ def __init__(
self.process.finished.connect(lambda: self.button_mklsts.setEnabled(True))
self.process.started.connect(lambda: self.button_dca_inference_gremlin.setEnabled(False))
self.process.finished.connect(lambda: self.button_dca_inference_gremlin.setEnabled(True))
self.process.started.connect(lambda: self.button_dca_inference_gremlin_msa_info.setEnabled(False))
self.process.finished.connect(lambda: self.button_dca_inference_gremlin_msa_info.setEnabled(True))
self.process.started.connect(lambda: self.button_dca_test_dca.setEnabled(False))
self.process.finished.connect(lambda: self.button_dca_test_dca.setEnabled(True))
self.process.started.connect(lambda: self.button_dca_predict_dca.setEnabled(False))
Expand Down Expand Up @@ -290,6 +302,14 @@ def pypef_gremlin(self):
self.version_text.setText("Running GREMLIN (DCA) optimization on MSA...")
self.exec_pypef(f'param_inference --wt {wt_fasta_file} --msa {msa_file}') # --opt_iter 100

@QtCore.Slot()
def pypef_gremlin_msa_info(self):
    """
    Slot: ask the user for a WT FASTA file and an MSA file and, if both
    were chosen, run the PyPEF ``save_msa_info`` command on them.

    Both file dialogs are always shown one after the other; nothing is
    executed when either selection is empty (e.g., the dialog was cancelled).
    """
    pick_file = QtWidgets.QFileDialog.getOpenFileName
    # getOpenFileName returns a sequence whose first item is the chosen path
    wt_path = pick_file(self, "Select WT FASTA File")[0]
    msa_path = pick_file(
        self,
        "Select Multiple Sequence Alignment (MSA) file (in FASTA or A2M format)"
    )[0]
    if not (wt_path and msa_path):
        return
    self.version_text.setText("Running GREMLIN (DCA) optimization on MSA...")
    self.exec_pypef(f'save_msa_info --wt {wt_path} --msa {msa_path}')

@QtCore.Slot()
def pypef_dca_test(self):
Expand Down
2 changes: 1 addition & 1 deletion pypef/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@
# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476
# https://doi.org/10.1021/acs.jcim.1c00099

__version__ = '0.3.7-dev'
__version__ = '0.3.7'
3 changes: 2 additions & 1 deletion pypef/dca/dca_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from pypef.utils.variant_data import read_csv, get_wt_sequence
from pypef.dca.plmc_encoding import save_plmc_dca_encoding_model
from pypef.dca.hybrid_model import get_model_and_type, performance_ls_ts, predict_ps, generate_model_and_save_pkl
from pypef.dca.gremlin_inference import save_gremlin_as_pickle, save_corr_csv, plot_all_corr_mtx
from pypef.dca.gremlin_inference import save_gremlin_as_pickle, save_corr_csv, plot_all_corr_mtx, plot_predicted_ssm
from pypef.utils.low_n_mutation_extrapolation import performance_mutation_extrapolation, low_n


Expand Down Expand Up @@ -128,6 +128,7 @@ def run_pypef_hybrid_modeling(arguments):
)
save_corr_csv(gremlin)
plot_all_corr_mtx(gremlin)
plot_predicted_ssm(gremlin)

else:
performance_ls_ts(
Expand Down
66 changes: 61 additions & 5 deletions pypef/dca/gremlin_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@
References:
[1] Kamisetty, H., Ovchinnikov, S., & Baker, D.
Assessing the utility of coevolution-based residueresidue contact predictions in a
Assessing the utility of coevolution-based residue-residue contact predictions in a
sequence- and structure-rich era.
Proceedings of the National Academy of Sciences, 2013, 110, 15674-15679
https://www.pnas.org/doi/10.1073/pnas.1314045110
[2] Balakrishnan, S., Kamisetty, H., Carbonell, J. G., Lee, S.-I., & Langmead, C. J.
Learning generative models for protein fold families.
Proteins, 79(4), 2011, 106178.
Proteins, 79(4), 2011, 1061-78.
https://doi.org/10.1002/prot.22934
[3] Ekeberg, M., Lövkvist, C., Lan, Y., Weigt, M., & Aurell, E.
Improved contact prediction in proteins: Using pseudolikelihoods to infer Potts models.
Expand All @@ -57,8 +57,9 @@
from scipy.special import logsumexp
from scipy.stats import boxcox
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
tf.get_logger().setLevel('DEBUG')
tf.get_logger().setLevel('WARNING')
# Uncomment to hide GPU devices
#environ['CUDA_VISIBLE_DEVICES'] = '-1'

Expand Down Expand Up @@ -647,6 +648,7 @@ def plot_correlation_matrix(self, matrix_type: str = 'apc', set_diag_zero=True):
ax.set_ylim(-1, matrix.shape[0])
plt.title(matrix_type.upper())
plt.savefig(f'{matrix_type}.png', dpi=500)
logger.info(f"Plotted correlation matrix {os.path.abspath(matrix_type)}.png")
plt.close('all')

def get_top_coevolving_residues(self, wt_seq=None, min_distance=0, sort_by="apc"):
Expand Down Expand Up @@ -718,7 +720,7 @@ def save_gremlin_as_pickle(alignment: str, wt_seq: str, opt_iter: int = 100):
},
open('Pickles/GREMLIN', 'wb')
)
logger.info(f"Saved GREMLIN model as Pickle file ({os.path.abspath('Pickles/GREMLIN')})...")
logger.info(f"Saved GREMLIN model as Pickle file as {os.path.abspath('Pickles/GREMLIN')}...")
return gremlin


Expand All @@ -732,4 +734,58 @@ def save_corr_csv(gremlin: GREMLIN, min_distance: int = 0, sort_by: str = 'apc')
df_mtx_sorted_mindist = gremlin.get_top_coevolving_residues(
min_distance=min_distance, sort_by=sort_by
)
df_mtx_sorted_mindist.to_csv(f"coevolution_{sort_by}_sorted.csv")
df_mtx_sorted_mindist.to_csv(f"coevolution_{sort_by}_sorted.csv", sep=',')
logger.info(f"Saved coevolution CSV data as "
f"{os.path.abspath(f'coevolution_{sort_by}_sorted.csv')}")


def plot_predicted_ssm(gremlin: GREMLIN):
    """
    Predict and plot the effects of all single amino acid substitutions
    at all WT/input sequence positions using the GREMLIN model; e.g.:
    M1A, M1C, M1E, ..., D2A, D2C, D2E, ..., ..., T300V, T300W, T300Y

    For every position of ``gremlin.wt_seq``, each character of
    ``gremlin.char_alphabet`` (gap character "-" removed, sorted) is
    substituted in turn and scored with ``gremlin.get_score``; the stored
    value is the variant score minus the wild-type score. The wild-type
    residue itself is not skipped (e.g., M1M is included), which keeps the
    score matrix rectangular (positions x alphabet).

    Side effects (paths are relative to the current working directory):
        - saves the heatmap as 'SSM_landscape.png'
        - saves variants, sequences, and scores as 'SSM_landscape.csv'

    Parameters
    ----------
    gremlin: GREMLIN
        Model object providing ``wt_seq``, ``char_alphabet``,
        ``get_wt_score()``, and ``get_score(sequence)``.

    Returns
    -------
    None
    """
    wt_sequence = gremlin.wt_seq
    wt_score = gremlin.get_wt_score()[0]
    aas = "".join(sorted(gremlin.char_alphabet.replace("-", "")))
    # Nested lists: outer index = sequence position, inner index = substituted amino acid
    variantss, variant_sequencess, variant_scoress = [], [], []
    logger.info("Predicting all SSM effects using the unsupervised GREMLIN model...")
    for i, aa_wt in enumerate(tqdm(wt_sequence)):
        variants, variant_sequences, variant_scores = [], [], []
        for aa_sub in aas:
            # Variant naming is 1-based: <WT amino acid><position><substituted amino acid>
            variant = aa_wt + str(i + 1) + aa_sub
            variant_sequence = wt_sequence[:i] + aa_sub + wt_sequence[i + 1:]
            variant_score = gremlin.get_score(variant_sequence)[0]
            variants.append(variant)
            variant_sequences.append(variant_sequence)
            # Store the score relative to the wild-type score
            variant_scores.append(variant_score - wt_score)
        variantss.append(variants)
        variant_sequencess.append(variant_sequences)
        variant_scoress.append(variant_scores)

    # Figure width scales with the sequence length so that cells stay readable
    fig, ax = plt.subplots(figsize=(2 * len(wt_sequence) / len(aas), 3))
    # Transposed: x-axis = sequence positions, y-axis = substituted amino acids
    ax.imshow(np.array(variant_scoress).T)
    for i_vss, vss in enumerate(variant_scoress):
        for i_vs, vs in enumerate(vss):
            ax.text(
                i_vss, i_vs,
                f'{variantss[i_vss][i_vs]}\n{round(vs, 1)}',
                size=1.5, va='center', ha='center'
            )
    ax.set_xticks(
        range(len(wt_sequence)),
        [f'{aa}{i + 1}' for i, aa in enumerate(wt_sequence)],
        size=6, rotation=90
    )
    ax.set_yticks(range(len(aas)), aas, size=6)
    plt.tight_layout()
    plt.savefig('SSM_landscape.png', dpi=500)
    # Release the figure after saving; otherwise it stays registered in pyplot
    # and its memory is kept for the rest of the run
    plt.close(fig)
    pd.DataFrame(
        {
            'Variant': np.array(variantss).flatten(),
            'Sequence': np.array(variant_sequencess).flatten(),
            'Variant_Score': np.array(variant_scoress).flatten()
        }
    ).to_csv('SSM_landscape.csv', sep=',')
    logger.info(f"Saved SSM landscape as {os.path.abspath('SSM_landscape.png')} "
                f"and CSV data as {os.path.abspath('SSM_landscape.csv')}...")
6 changes: 3 additions & 3 deletions pypef/dca/hybrid_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,6 @@ def save_model_to_dict_pickle(
model_type = 'MODEL'

pkl_path = os.path.abspath(f'Pickles/{model_type}')
logger.info(f'Saving model as Pickle file ({pkl_path})...')
pickle.dump(
{
'model': model,
Expand All @@ -698,6 +697,7 @@ def save_model_to_dict_pickle(
},
open(f'Pickles/{model_type}', 'wb')
)
logger.info(f'Saved model as Pickle file ({pkl_path})...')


global_model = None
Expand Down Expand Up @@ -742,9 +742,9 @@ def plmc_or_gremlin_encoding(
logger.info(f"Following positions are frequent gap positions in the MSA "
f"and cannot be considered for effective modeling, i.e., "
f"substitutions at these positions are removed as these would be "
f"predicted with wild-type fitness:\n{[gap + 1 for gap in model.gaps]}.\n"
f"predicted with wild-type fitness:\n{[int(gap) + 1 for gap in model.gaps]}.\n"
f"Effective positions (N={len(model.v_idx)}) are:\n"
f"{[v_pos + 1 for v_pos in model.v_idx]}")
f"{[int(v_pos) + 1 for v_pos in model.v_idx]}")
xs, x_wt, variants, sequences, ys_true = gremlin_encoding(
model, variants, sequences, ys_true,
shift_pos=1, substitution_sep=substitution_sep
Expand Down
1 change: 1 addition & 0 deletions pypef/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@
from pypef.dca.dca_run import run_pypef_hybrid_modeling
from pypef.utils.utils_run import run_pypef_utils


logger = logging.getLogger("pypef")
logger.setLevel(logging.INFO)

Expand Down
19 changes: 9 additions & 10 deletions pypef/ml/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,11 +649,11 @@ def formatted_output(
for (idx, val, val2, val3, val4, val5, r_m, pam) in performance_list:
if val >= minimum_r2:
index.append(get_basename(idx))
value.append('{:f}'.format(val))
value2.append('{:f}'.format(val2))
value3.append('{:f}'.format(val3))
value4.append('{:f}'.format(val4))
value5.append('{:f}'.format(val5))
value.append(f'{val:.6f}')
value2.append(f'{val2:.6f}')
value3.append(f'{val3:.6f}')
value4.append(f'{val4:.6f}')
value5.append(f'{val5:.6f}')
regression_model.append(r_m.upper())
params.append(pam)

Expand Down Expand Up @@ -853,10 +853,9 @@ def crossval_on_all(x_train, x_test, y_train, y_test, regressor: str, parameter,
get_performances(y_test_total, y_predicted_total)

with open(cv_filename, 'a') as f:
f.write('Regression type: {}; Parameter: {}; Encoding index: {}\n'.format(
regressor.upper(), parameter, name))
f.write('R2 = {:.5f}; RMSE = {:.5f}; NRMSE = {:.5f}; Pearson\'s r = {:.5f};'
' Spearman\'s rho = {:.5f}\n\n'.format(r_squared, rmse, nrmse, pearson_r, spearman_rho))
f.write(f'Regression type: {regressor.upper()}; Parameter: {parameter}; Encoding index: {name}\n')
f.write(f'R2 = {r_squared:.6f}; RMSE = {rmse:.6f}; NRMSE = {nrmse:.6f}; Pearson\'s r = {pearson_r:.6f}; '
f'Spearman\'s rho = {spearman_rho:.6f}\n\n')

figure, ax = plt.subplots()
legend = r'$R^2$' + f' = {r_squared:.3f}' + f'\nRMSE = {rmse:.3f}' + f'\nNRMSE = {nrmse:.3f}' + \
Expand Down Expand Up @@ -959,10 +958,10 @@ def save_model(
if model_type in ['PLMC', 'GREMLIN'] and encoding not in ['aaidx', 'onehot']:
name = 'ML' + model_type.lower()
f_name = os.path.abspath(os.path.join(path, 'Pickles', name))
logger.info(f'Saving model ({f_name})...')
file = open(f_name, 'wb')
pickle.dump(regressor_, file)
file.close()
logger.info(f'Saved model as {f_name}...')

except IndexError:
raise IndexError
Expand Down
2 changes: 1 addition & 1 deletion pypef/utils/low_n_mutation_extrapolation.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,8 @@ def performance_mutation_extrapolation(
logger.info('Fitting regressor on lvl 1 substitution data...')
regressor.fit(x_train, y_train)
if save_model:
logger.info(f'Saving model as Pickle file: ML_LVL_1')
pickle.dump(regressor, open(os.path.join('Pickles', 'ML_LVL_1'), 'wb'))
logger.info(f'Saved model as Pickle file: ML_LVL_1')
for i, _ in enumerate(tqdm(collected_levels)):
if i < len(collected_levels) - 1: # not last i else error, last entry is: lvl 1 --> all higher variants
test_idx = collected_levels[i + 1]
Expand Down
2 changes: 1 addition & 1 deletion pypef/utils/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,6 @@ def plot_y_true_vs_y_pred(
# i += 1 # iterate until finding an unused file name
# file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png'
plt.colorbar()
logger.info(f'Saving plot ({os.path.abspath(file_name)})...')
plt.savefig(file_name, dpi=500)
plt.close('all')
logger.info(f'Saved plot as {os.path.abspath(file_name)}...')

0 comments on commit 2eb5682

Please sign in to comment.