From 1bdd482fe1b09f1ab21b2e4e77306840eb940ee7 Mon Sep 17 00:00:00 2001 From: niklases <60261497+niklases@users.noreply.github.com> Date: Fri, 5 Jan 2024 10:39:51 +0100 Subject: [PATCH] Added only --ps option for pure DCA --- .gitignore | 1 + .vscode/launch.json | 53 ++++++++++++++++++++++++++++++ .vscode/settings.json | 3 ++ README.md | 17 ++++++++-- pypef/dca/dca_run.py | 2 +- pypef/dca/hybrid_model.py | 9 +++-- pypef/main.py | 3 +- scripts/CLI/run_cli_tests_linux.sh | 12 ++++++- scripts/CLI/run_cli_tests_win.ps1 | 16 ++++++++- 9 files changed, 106 insertions(+), 10 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.gitignore b/.gitignore index e5542be..39aa73a 100644 --- a/.gitignore +++ b/.gitignore @@ -387,3 +387,4 @@ scripts/Setup/linux/apc.png datasets/ANEH/KARS160122_PLS_LOOCV_ML_Model_Performance.png datasets/ANEH/CV_performance/KARS160122_PLS_LOOCV_5-fold-CV.png datasets/ANEH/CV_performance/KARS160122_PLS_LOOCV_CV_Results.txt +datasets/AVGFP/Predictions_Hybrid_TopTS.txt diff --git a/.vscode/launch.json b/.vscode/launch.json index b90f5c8..7474635 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -104,6 +104,23 @@ ] }, + { + "name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "hybrid", + //"-m", "GREMLIN", // optional, not required + "--ps", "TS.fasl", + "--params", "GREMLIN" + ] + }, + { // PLMC zero-shot steps: // 1. $pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params // 2. 
$pypef hybrid -t TS.fasl --params PLMC @@ -136,6 +153,42 @@ "--params", "PLMC", "--threads", "24" ] + }, + + { + "name": "Python: PyPEF hybrid/only-PS-zero-shot PLMC-DCA avGFP", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "hybrid", + //"-m", "PLMC", // optional, not required + "--ps", "TS.fasl", + "--params", "uref100_avgfp_jhmmer_119_plmc_42.6.params", + "--threads", "24" + ] + }, + + { + "name": "Python: PyPEF hybrid/only-PS-zero-shot PLMC-DCA variant 2 avGFP", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "hybrid", + //"-m", "PLMC", // optional, not required + "--ps", "TS.fasl", + "--params", "PLMC", + "--threads", "24" + ] } ] } \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..2b6b3b4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "markdown.extension.toc.updateOnSave": false +} diff --git a/README.md b/README.md index cf86d5c..9959bd4 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,9 @@ Preprint available at bioRxiv: https://doi.org/10.1101/2022.06.07.495081. *§*Equal contribution
--- + ## Table of Contents -- [PyPEF: Pythonic Protein Engineering Framework](#pypef-pythonic-protein-engineering-framework) +[PyPEF: Pythonic Protein Engineering Framework](#pypef-pythonic-protein-engineering-framework) - [Quick Installation](#quick-installation) - [Requirements](#requirements) - [Running Examples](#running-examples) @@ -398,15 +399,19 @@ python3 ./pypef/main.py ``` 5.2 After [installing plmc](https://github.com/debbiemarkslab/plmc#compilation), generate the evolutionary coupling file, which is used for encoding sequences. For example, set `-le` to the value output by `sto2a2m`: + ``` plmc -o ANEH_72.6.params -le 72.6 -m 100 -g -f WT_ANEH ANEH_jhmmer.a2m ``` The output parameter (.params) file can be used for encoding sequences with the DCA-based encoding technique (`-e dca`) by providing it to PyPEF; e.g. for pure ML modeling: + ``` pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params ANEH_72.6.params ``` + Or for hybrid modeling: + ``` pypef hybrid -l LS.fasl -t TS.fasl --params ANEH_72.6.params ``` @@ -420,21 +425,27 @@ To make zero-shot predictions using PyPEF (plmc-DCA or GREMLIN-DCA) just do not ``` pypef param_inference --msa uref100_avgfp_jhmmer_119.a2m -pypef hybrid -t AVGFP_TS.fasl --params GREMLIN +pypef hybrid -t TS.fasl --params GREMLIN ``` + using the GREMLIN parameters, or, + ``` pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params pypef hybrid -t TS.fasl --params PLMC ``` + using the plmc parameters. 
-Other well-performing zero-shot prediction methods with available source code are (list not complete, see ProteinGym [repository](https://github.com/OATML-Markslab/ProteinGym) and [website](https://proteingym.org/) for a more detailed overview of available methods and achieved performances): +Other well-performing zero-shot prediction methods with available source code are: + - ESM-1v/ESM-2 (https://github.com/facebookresearch/esm) - DeepSequence (https://github.com/debbiemarkslab/DeepSequence) - EVcouplings (plmc-DCA, https://github.com/debbiemarkslab/EVcouplings) - EVE (https://github.com/OATML/EVE) - Tranception (https://github.com/OATML-Markslab/Tranception) + +This list is by no means complete; see the ProteinGym [repository](https://github.com/OATML-Markslab/ProteinGym) and [website](https://proteingym.org/) for a more detailed overview of available methods and achieved performances (as well as for obtaining many benchmark data sets). ## API Usage for Sequence Encoding diff --git a/pypef/dca/dca_run.py b/pypef/dca/dca_run.py index e1fa8ca..1447a98 100644 --- a/pypef/dca/dca_run.py +++ b/pypef/dca/dca_run.py @@ -61,7 +61,7 @@ def run_pypef_hybrid_modeling(arguments): label=arguments['--label'] ) - elif arguments['--params'] and arguments['--model']: + elif arguments['--params'] and arguments['--model'] or arguments['--ps']: prediction_dict = {} prediction_dict.update({ 'drecomb': arguments['--drecomb'], diff --git a/pypef/dca/hybrid_model.py b/pypef/dca/hybrid_model.py index 0647ba5..509fc0f 100644 --- a/pypef/dca/hybrid_model.py +++ b/pypef/dca/hybrid_model.py @@ -1094,11 +1094,14 @@ def predict_ps( # also predicting "pmult" dict directories in the respective created folders). 
""" - logger.info(f'Taking model from saved model (Pickle file): {model_pickle_file}...') - + if model_pickle_file is None: + model_pickle_file = params_file + logger.info(f'Trying to load model from saved parameters (Pickle file): {model_pickle_file}...') + else: + logger.info(f'Loading model from saved model (Pickle file): {model_pickle_file}...') model, model_type = get_model_and_type(model_pickle_file) - if model_type == 'PLMC': + if model_type == 'PLMC' or model_type == 'GREMLIN': logger.info(f'No hybrid model provided – falling back to a statistical DCA model.') elif model_type == 'Hybrid': beta_1, beta_2, reg = model.beta_1, model.beta_2, model.regressor diff --git a/pypef/main.py b/pypef/main.py index 1d312f3..1c35d32 100644 --- a/pypef/main.py +++ b/pypef/main.py @@ -145,7 +145,8 @@ pypef shift_pos --input CSV_FILE --offset OFFSET [--sep CSV_COLUMN_SEPARATOR] [--mutation_sep MUTATION_SEPARATOR] [--fitness_key FITNESS_KEY] pypef sto2a2m --sto STO_MSA_FILE [--inter_gap INTER_GAP] [--intra_gap INTRA_GAP] - pypef hybrid --ts TEST_SET + pypef hybrid + [--ts TEST_SET] [--ps PREDICTION_SET] [--model MODEL] [--params PARAM_FILE] [--ls LEARNING_SET] [--label] [--threads THREADS] pypef hybrid --model MODEL --params PARAM_FILE diff --git a/scripts/CLI/run_cli_tests_linux.sh b/scripts/CLI/run_cli_tests_linux.sh index d744e0b..e155447 100644 --- a/scripts/CLI/run_cli_tests_linux.sh +++ b/scripts/CLI/run_cli_tests_linux.sh @@ -368,11 +368,21 @@ echo $pypef hybrid -m PLMC -t TS.fasl --params PLMC --threads $threads echo -# pure statistical +# Hybrid: pure statistical $pypef hybrid -t TS.fasl --params PLMC --threads $threads echo +$pypef hybrid -p TS.fasl --params PLMC --threads $threads +echo +# Same as above command +$pypef hybrid -p TS.fasl -m PLMC --params PLMC --threads $threads +echo $pypef hybrid -t TS.fasl --params GREMLIN echo +$pypef hybrid -p TS.fasl --params GREMLIN +echo +# Same as above command +$pypef hybrid -p TS.fasl -m GREMLIN --params GREMLIN +echo 
$pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN echo $pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN diff --git a/scripts/CLI/run_cli_tests_win.ps1 b/scripts/CLI/run_cli_tests_win.ps1 index daac44a..96f0a40 100644 --- a/scripts/CLI/run_cli_tests_win.ps1 +++ b/scripts/CLI/run_cli_tests_win.ps1 @@ -511,13 +511,27 @@ pypef hybrid -m PLMC -t TS.fasl --params PLMC --threads $threads ExitOnExitCode Write-Host -# pure statistical +# Hybrid: pure statistical pypef hybrid -t TS.fasl --params PLMC --threads $threads ExitOnExitCode Write-Host +pypef hybrid -p TS.fasl --params PLMC --threads $threads +ExitOnExitCode +Write-Host +# Same as above command +pypef hybrid -p TS.fasl -m PLMC --params PLMC --threads $threads +ExitOnExitCode +Write-Host pypef hybrid -t TS.fasl --params GREMLIN ExitOnExitCode Write-Host +pypef hybrid -p TS.fasl --params GREMLIN +ExitOnExitCode +Write-Host +# Same as above command +pypef hybrid -p TS.fasl -m GREMLIN --params GREMLIN +ExitOnExitCode +Write-Host pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN ExitOnExitCode Write-Host