Download ProteinGym data
niklases committed Jun 16, 2024
1 parent b6b6463 commit c064905
Showing 4 changed files with 139 additions and 2 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -10,6 +10,14 @@ workflow/.ipynb_checkpoints/
__pycache__/
*.py[cod]

# Downloaded ProteinGym data
scripts/ProteinGym_runs/DMS_msa_files/
scripts/ProteinGym_runs/DMS_ProteinGym_substitutions/
scripts/ProteinGym_runs/ProteinGym_AF2_structures/
scripts/ProteinGym_runs/multi_point_dms_mut_data.json
scripts/ProteinGym_runs/single_point_dms_mut_data.json
scripts/ProteinGym_runs/_Description_DMS_substitutions_data.csv

# Created test/output files
scripts/Setup/windows/Miniconda3-latest-Windows-x86_64.exe
scripts/Setup/windows/Miniconda3/*
4 changes: 2 additions & 2 deletions pypef/main.py
@@ -94,7 +94,7 @@
pypef ml -e aaidx -m MODEL -p Prediction_Set.fasta
- Recombinant/diverse prediction files in created prediction set folders:
pypef ml -e aaidx -m MODEL --pmult [--drecomb] [...] [--qdiverse]
- - Directed evolution for performing and plotting in silico evolution trajectories:
+ - Directed evolution - for performing and plotting in silico evolution trajectories:
pypef ml -e aaidx directevo -m MODEL [...]
Note: The commands for hybrid modeling are very similar to the commands for pure ML modeling,
see pypef -h for possible commands.
@@ -105,7 +105,7 @@
Helpful commands for data conversion
-----------------------------------------------
- Creation of learning and test sets splitting CSV variant-fitness data:
+ Creation of learning and test sets - splitting CSV variant-fitness data:
pypef mklsts --wt WT_FASTA --input CSV_FILE
[--drop THRESHOLD] [--numrnd NUMBER]
3 changes: 3 additions & 0 deletions scripts/ProteinGym_runs/README.md
@@ -0,0 +1,3 @@
## Benchmark runs on publicly available ProteinGym protein variant sequence-fitness datasets

The data (DMS assay CSVs, MSAs, and AF2 structures) is downloaded script-based, via `download_proteingym_and_extract_data.py`, from the data provided at https://proteingym.org/download.
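
A minimal usage sketch (assumption: the commands are run from this `scripts/ProteinGym_runs/` directory so that relative paths resolve; the function names are those defined in `download_proteingym_and_extract_data.py`):

```python
# Minimal sketch: download the ProteinGym data and build the single-point JSON index.
from download_proteingym_and_extract_data import (
    download_proteingym_data,
    get_single_or_multi_point_mut_data,
)

download_proteingym_data()  # fetches the description CSV, DMS CSVs, MSAs, and AF2 structures
mut_data = get_single_or_multi_point_mut_data(
    '_Description_DMS_substitutions_data.csv', single=True
)
print(f'{len(mut_data)} single-point DMS datasets indexed.')
```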
126 changes: 126 additions & 0 deletions scripts/ProteinGym_runs/download_proteingym_and_extract_data.py
@@ -0,0 +1,126 @@
import os
import urllib.request
import zipfile
import json

import pandas as pd

# To use unverified SSL, uncomment the following two lines; taken from:
# https://stackoverflow.com/questions/50236117/scraping-ssl-certificate-verify-failed-error-for-http-en-wikipedia-org
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context



def download_proteingym_data():
    file_dirname = os.path.dirname(__file__)
    url = 'https://marks.hms.harvard.edu/proteingym/DMS_substitutions.csv'
    print(f'Getting {url}...')
    urllib.request.urlretrieve(
        url, os.path.join(file_dirname, '_Description_DMS_substitutions_data.csv'))
    # Download and unpack the three ZIP archives (DMS datasets, MSAs, AF2 structures)
    for zip_name in ('DMS_ProteinGym_substitutions', 'DMS_msa_files',
                     'ProteinGym_AF2_structures'):
        url = f'https://marks.hms.harvard.edu/proteingym/{zip_name}.zip'
        print(f'Getting {url}...')
        zip_path = os.path.join(file_dirname, f'{zip_name}.zip')
        urllib.request.urlretrieve(url, zip_path)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(os.path.join(file_dirname, zip_name))
        os.remove(zip_path)


def get_single_or_multi_point_mut_data(csv_description_path, datasets_path=None,
                                       msas_path=None, pdbs_path=None, single: bool = True):
    """
    Collect the ProteinGym data for either the single-point or the multi-point
    mutant datasets, i.e., all target datasets that have single- or multi-point
    mutated variants available. Reads the dataset description/overview CSV and
    searches for the available data in the 'DMS_ProteinGym_substitutions'
    sub-directory.
    """
    type_str = 'single' if single else 'multi'
    file_dirname = os.path.abspath(os.path.dirname(__file__))
    if datasets_path is None:
        datasets_path = os.path.join(file_dirname, 'DMS_ProteinGym_substitutions')
    if msas_path is None:
        msas_path = os.path.join(file_dirname, 'DMS_msa_files')  # used to be DMS_msa_files/MSA_files/DMS
    msas = os.listdir(msas_path)
    if pdbs_path is None:
        pdbs_path = os.path.join(file_dirname, 'ProteinGym_AF2_structures')
    pdbs = os.listdir(pdbs_path)
    description_df = pd.read_csv(csv_description_path, sep=',')
    i_mps = []
    for i, n_mp in enumerate(description_df['DMS_number_multiple_mutants'].to_list()):
        if description_df['MSA_start'][i] == 1:  # TODO: Else shift WT seq by description_df['MSA_start']
            # Datasets with n_mp > 0 contain multi-point mutants; datasets with
            # n_mp == 0 contain single-point mutants only.
            if single == (n_mp == 0):
                i_mps.append(i)
    mp_description_df = description_df.iloc[i_mps, :]
mp_filenames = mp_description_df['DMS_filename'].to_list()
mp_wt_seqs = mp_description_df['target_seq'].to_list()
print(f'Searching for CSV files in {datasets_path}...')
csv_paths = [os.path.join(datasets_path, mp_filename) for mp_filename in mp_filenames]
print(f'Found {len(csv_paths)} {type_str}-point datasets, will check if all are available in datasets folder...')
avail_filenames, avail_csvs, avail_wt_seqs = [], [], []
for i, csv_path in enumerate(csv_paths):
if not os.path.isfile(csv_path):
# Used to be an error in files: CHECK: Likely 'Rocklin' mistake in CSV! Should be Tsuboyama(?)
print(f"Did not find CSV file {csv_path} - will remove it from prediction process!")
else:
avail_csvs.append(csv_path)
avail_wt_seqs.append(mp_wt_seqs[i])
avail_filenames.append(os.path.splitext(mp_filenames[i])[0])
print(csv_paths[0])
assert len(avail_wt_seqs) == len(avail_csvs)
print(f'Getting data from {len(avail_csvs)} {type_str}-point mutation DMS CSV files...')
    dms_mp_data = {}
    for i, csv_path in enumerate(avail_csvs):
        # The filename prefix (first two underscore-separated fields) links each
        # dataset to its MSA and PDB file
        begin = '_'.join(avail_filenames[i].split('_')[:2])
        msa_path, pdb_path = None, None  # reset both to avoid reusing stale paths from a previous iteration
        for msa in msas:
            if msa.startswith(begin):
                msa_path = os.path.join(msas_path, msa)
        for pdb in pdbs:
            if pdb.startswith(begin):
                pdb_path = os.path.join(pdbs_path, pdb)
        if msa_path is None or pdb_path is None:
            continue
        dms_mp_data.update({
            avail_filenames[i]: {
                'CSV_path': csv_path,
                'WT_sequence': avail_wt_seqs[i],
                'MSA_path': msa_path,
                'PDB_path': pdb_path
            }
        })
    return dms_mp_data


if __name__ == '__main__':
    # download_proteingym_data()  # uncomment to (re-)download the ProteinGym data first
    single = True
    type_str = 'single' if single else 'multi'
    mut_data = get_single_or_multi_point_mut_data(
        os.path.join(os.path.dirname(__file__), '_Description_DMS_substitutions_data.csv'),
        single=single
    )
    json_output_file = os.path.abspath(os.path.join(
        os.path.dirname(__file__), f'{type_str}_point_dms_mut_data.json'))
    with open(json_output_file, 'w') as fp:
        json.dump(mut_data, fp, indent=4)
    print(f'Saved the {type_str}-point DMS data information as JSON file to {json_output_file}.')
