From 1bdd482fe1b09f1ab21b2e4e77306840eb940ee7 Mon Sep 17 00:00:00 2001
From: niklases <60261497+niklases@users.noreply.github.com>
Date: Fri, 5 Jan 2024 10:39:51 +0100
Subject: [PATCH] Added only --ps option for pure DCA
---
.gitignore | 1 +
.vscode/launch.json | 53 ++++++++++++++++++++++++++++++
.vscode/settings.json | 3 ++
README.md | 17 ++++++++--
pypef/dca/dca_run.py | 2 +-
pypef/dca/hybrid_model.py | 9 +++--
pypef/main.py | 3 +-
scripts/CLI/run_cli_tests_linux.sh | 12 ++++++-
scripts/CLI/run_cli_tests_win.ps1 | 16 ++++++++-
9 files changed, 106 insertions(+), 10 deletions(-)
create mode 100644 .vscode/settings.json
diff --git a/.gitignore b/.gitignore
index e5542be..39aa73a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -387,3 +387,4 @@ scripts/Setup/linux/apc.png
datasets/ANEH/KARS160122_PLS_LOOCV_ML_Model_Performance.png
datasets/ANEH/CV_performance/KARS160122_PLS_LOOCV_5-fold-CV.png
datasets/ANEH/CV_performance/KARS160122_PLS_LOOCV_CV_Results.txt
+datasets/AVGFP/Predictions_Hybrid_TopTS.txt
diff --git a/.vscode/launch.json b/.vscode/launch.json
index b90f5c8..7474635 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -104,6 +104,23 @@
]
},
+ {
+ "name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP",
+ "type": "python",
+ "request": "launch",
+ "env": {"PYTHONPATH": "${workspaceFolder}"},
+ "program": "${workspaceFolder}/pypef/main.py",
+ "console": "integratedTerminal",
+ "justMyCode": true,
+ "cwd": "${workspaceFolder}/datasets/AVGFP/",
+ "args": [
+ "hybrid",
+ //"-m", "GREMLIN", // optional, not required
+ "--ps", "TS.fasl",
+ "--params", "GREMLIN"
+ ]
+ },
+
{ // PLMC zero-shot steps:
// 1. $pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params
// 2. $pypef hybrid -t TS.fasl --params PLMC
@@ -136,6 +153,42 @@
"--params", "PLMC",
"--threads", "24"
]
+ },
+
+ {
+ "name": "Python: PyPEF hybrid/only-PS-zero-shot PLMC-DCA avGFP",
+ "type": "python",
+ "request": "launch",
+ "env": {"PYTHONPATH": "${workspaceFolder}"},
+ "program": "${workspaceFolder}/pypef/main.py",
+ "console": "integratedTerminal",
+ "justMyCode": true,
+ "cwd": "${workspaceFolder}/datasets/AVGFP/",
+ "args": [
+ "hybrid",
+ //"-m", "PLMC", // optional, not required
+ "--ps", "TS.fasl",
+ "--params", "uref100_avgfp_jhmmer_119_plmc_42.6.params",
+ "--threads", "24"
+ ]
+ },
+
+ {
+ "name": "Python: PyPEF hybrid/only-PS-zero-shot PLMC-DCA variant 2 avGFP",
+ "type": "python",
+ "request": "launch",
+ "env": {"PYTHONPATH": "${workspaceFolder}"},
+ "program": "${workspaceFolder}/pypef/main.py",
+ "console": "integratedTerminal",
+ "justMyCode": true,
+ "cwd": "${workspaceFolder}/datasets/AVGFP/",
+ "args": [
+ "hybrid",
+ //"-m", "PLMC", // optional, not required
+ "--ps", "TS.fasl",
+ "--params", "PLMC",
+ "--threads", "24"
+ ]
}
]
}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..2b6b3b4
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+ "markdown.extension.toc.updateOnSave": false
+}
diff --git a/README.md b/README.md
index cf86d5c..9959bd4 100644
--- a/README.md
+++ b/README.md
@@ -17,8 +17,9 @@ Preprint available at bioRxiv: https://doi.org/10.1101/2022.06.07.495081.
*§*Equal contribution
---
+
## Table of Contents
-- [PyPEF: Pythonic Protein Engineering Framework](#pypef-pythonic-protein-engineering-framework)
+[PyPEF: Pythonic Protein Engineering Framework](#pypef-pythonic-protein-engineering-framework)
- [Quick Installation](#quick-installation)
- [Requirements](#requirements)
- [Running Examples](#running-examples)
@@ -398,15 +399,19 @@ python3 ./pypef/main.py
```
5.2 After [installing plmc](https://github.com/debbiemarkslab/plmc#compilation), generate the evolutionary coupling file, which is used for encoding sequences. For example, set `-le` to the value output by `sto2a2m`:
+
```
plmc -o ANEH_72.6.params -le 72.6 -m 100 -g -f WT_ANEH ANEH_jhmmer.a2m
```
The output parameter (.params) file can be used for encoding sequences with the DCA-based encoding technique (`-e dca`) by providing it to PyPEF; e.g. for pure ML modeling:
+
```
pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params ANEH_72.6.params
```
+
Or for hybrid modeling:
+
```
pypef hybrid -l LS.fasl -t TS.fasl --params ANEH_72.6.params
```
@@ -420,21 +425,27 @@ To make zero-shot predictions using PyPEF (plmc-DCA or GREMLIN-DCA) just do not
```
pypef param_inference --msa uref100_avgfp_jhmmer_119.a2m
-pypef hybrid -t AVGFP_TS.fasl --params GREMLIN
+pypef hybrid -t TS.fasl --params GREMLIN
```
+
using the GREMLIN parameters, or,
+
```
pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params
pypef hybrid -t TS.fasl --params PLMC
```
+
using the plmc parameters.
-Other well-performing zero-shot prediction methods with available source code are (list not complete, see ProteinGym [repository](https://github.com/OATML-Markslab/ProteinGym) and [website](https://proteingym.org/) for a more detailed overview of available methods and achieved performances):
+Other well-performing zero-shot prediction methods with available source code are:
+
- ESM-1v/ESM-2 (https://github.com/facebookresearch/esm)
- DeepSequence (https://github.com/debbiemarkslab/DeepSequence)
- EVcouplings (plmc-DCA, https://github.com/debbiemarkslab/EVcouplings)
- EVE (https://github.com/OATML/EVE)
- Tranception (https://github.com/OATML-Markslab/Tranception)
+
+This list is by no means complete; see the ProteinGym [repository](https://github.com/OATML-Markslab/ProteinGym) and [website](https://proteingym.org/) for a more detailed overview of available methods and achieved performances (as well as for obtaining many benchmark data sets).
## API Usage for Sequence Encoding
diff --git a/pypef/dca/dca_run.py b/pypef/dca/dca_run.py
index e1fa8ca..1447a98 100644
--- a/pypef/dca/dca_run.py
+++ b/pypef/dca/dca_run.py
@@ -61,7 +61,7 @@ def run_pypef_hybrid_modeling(arguments):
label=arguments['--label']
)
- elif arguments['--params'] and arguments['--model']:
+ elif arguments['--params'] and arguments['--model'] or arguments['--ps']:
prediction_dict = {}
prediction_dict.update({
'drecomb': arguments['--drecomb'],
diff --git a/pypef/dca/hybrid_model.py b/pypef/dca/hybrid_model.py
index 0647ba5..509fc0f 100644
--- a/pypef/dca/hybrid_model.py
+++ b/pypef/dca/hybrid_model.py
@@ -1094,11 +1094,14 @@ def predict_ps( # also predicting "pmult" dict directories
in the respective created folders).
"""
- logger.info(f'Taking model from saved model (Pickle file): {model_pickle_file}...')
-
+ if model_pickle_file is None:
+ model_pickle_file = params_file
+ logger.info(f'Trying to load model from saved parameters (Pickle file): {model_pickle_file}...')
+ else:
+ logger.info(f'Loading model from saved model (Pickle file): {model_pickle_file}...')
model, model_type = get_model_and_type(model_pickle_file)
- if model_type == 'PLMC':
+ if model_type == 'PLMC' or model_type == 'GREMLIN':
logger.info(f'No hybrid model provided – falling back to a statistical DCA model.')
elif model_type == 'Hybrid':
beta_1, beta_2, reg = model.beta_1, model.beta_2, model.regressor
diff --git a/pypef/main.py b/pypef/main.py
index 1d312f3..1c35d32 100644
--- a/pypef/main.py
+++ b/pypef/main.py
@@ -145,7 +145,8 @@
pypef shift_pos --input CSV_FILE --offset OFFSET
[--sep CSV_COLUMN_SEPARATOR] [--mutation_sep MUTATION_SEPARATOR] [--fitness_key FITNESS_KEY]
pypef sto2a2m --sto STO_MSA_FILE [--inter_gap INTER_GAP] [--intra_gap INTRA_GAP]
- pypef hybrid --ts TEST_SET
+ pypef hybrid
+ [--ts TEST_SET] [--ps PREDICTION_SET]
[--model MODEL] [--params PARAM_FILE]
[--ls LEARNING_SET] [--label] [--threads THREADS]
pypef hybrid --model MODEL --params PARAM_FILE
diff --git a/scripts/CLI/run_cli_tests_linux.sh b/scripts/CLI/run_cli_tests_linux.sh
index d744e0b..e155447 100644
--- a/scripts/CLI/run_cli_tests_linux.sh
+++ b/scripts/CLI/run_cli_tests_linux.sh
@@ -368,11 +368,21 @@ echo
$pypef hybrid -m PLMC -t TS.fasl --params PLMC --threads $threads
echo
-# pure statistical
+# Hybrid: pure statistical
$pypef hybrid -t TS.fasl --params PLMC --threads $threads
echo
+$pypef hybrid -p TS.fasl --params PLMC --threads $threads
+echo
+# Same as the command above, with the model given explicitly via -m
+$pypef hybrid -p TS.fasl -m PLMC --params PLMC --threads $threads
+echo
$pypef hybrid -t TS.fasl --params GREMLIN
echo
+$pypef hybrid -p TS.fasl --params GREMLIN
+echo
+# Same as the command above, with the model given explicitly via -m
+$pypef hybrid -p TS.fasl -m GREMLIN --params GREMLIN
+echo
$pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN
echo
$pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN
diff --git a/scripts/CLI/run_cli_tests_win.ps1 b/scripts/CLI/run_cli_tests_win.ps1
index daac44a..96f0a40 100644
--- a/scripts/CLI/run_cli_tests_win.ps1
+++ b/scripts/CLI/run_cli_tests_win.ps1
@@ -511,13 +511,27 @@ pypef hybrid -m PLMC -t TS.fasl --params PLMC --threads $threads
ExitOnExitCode
Write-Host
-# pure statistical
+# Hybrid: pure statistical
pypef hybrid -t TS.fasl --params PLMC --threads $threads
ExitOnExitCode
Write-Host
+pypef hybrid -p TS.fasl --params PLMC --threads $threads
+ExitOnExitCode
+Write-Host
+# Same as the command above, with the model given explicitly via -m
+pypef hybrid -p TS.fasl -m PLMC --params PLMC --threads $threads
+ExitOnExitCode
+Write-Host
pypef hybrid -t TS.fasl --params GREMLIN
ExitOnExitCode
Write-Host
+pypef hybrid -p TS.fasl --params GREMLIN
+ExitOnExitCode
+Write-Host
+# Same as the command above, with the model given explicitly via -m
+pypef hybrid -p TS.fasl -m GREMLIN --params GREMLIN
+ExitOnExitCode
+Write-Host
pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN
ExitOnExitCode
Write-Host