get_snapshot_configs.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 16 08:13:37 2018

Loops through desired snapshots. For each native substructure, computes average
distance between residues participating in that substrucutre in the current
snapshot. This value is also refered to as a "score"

In general, you can apply substructures from a protein of length L to analyze 
a smaller protein of length L'<=L (e.g. a truncation of the larger protein)
If so, then we truncate the contact map/substructures at L' residues

Example instances, assuming the directory to analyze is located at a path ../1igd/MultiUmbrella17:
python get_snapshot_configs.py --directory=../1igd/MultiUmbrella17 --root='1igd' --native_file=../1igd/files/1igd_0.200_20_Emin.pdb --d_cutoff=6.5 --min_seq_separation=3 --contact_sep_thresh=7 
python get_snapshot_configs.py --directory=../1igd/MultiUmbrella17 --root='1igd' --native_file=../1igd/files/1igd_0.200_20_Emin.pdb --substructure_file=1igd_data/Substructures.dat 
python get_snapshot_configs.py --directory=../1igd/MultiUmbrella17 --root='1igd' --native_file=../1igd/files/1igd_0.200_20_Emin.pdb --substructure_file=1igd_data/Substructures.dat --temperatures='0.800, 0.825' --output_filename='SS_test.dat'
python get_snapshot_configs.py --directory=../1igd/MultiUmbrella17 --root='1igd' --native_file=../1igd/files/1igd_0.200_20_Emin.pdb --substructure_file=1igd_data/Substructures.dat --temperatures='0.800, 0.825' --output_filename='SS_test.dat' --min_step=0 --max_step=100000000

@author: amirbitran
"""


import numpy as np
import joblib
import dbfold
import dbfold.analyze_structures
import dbfold.utils
import glob
import argparse


parser = argparse.ArgumentParser(description='Hi')
parser.add_argument("--directory", help = 'This is the path to the directory containing the PDB files we want to analyze...' )
parser.add_argument("--root", help = 'The root for PDB files...all PDB files will be named {root}_{temperature}_{setpoint}.{MC timestep} for equilibrium simulations, and {root}_{temperature}_{traj number}.{MC timestep} for unfolding simulations')
parser.add_argument("--native_file", help = 'Path to the native file for the protein whose snapshots you want to analyze. Must be specified even if you provide your own substructures in advance')


parser.add_argument("--d_cutoff",  default = '6.5', help = 'distance cutoff (in angstroms) used to compute contacts. If not specified, then --substructure_file must be specifed instead so that pre-computed substructures can be used. Defaults to 6.5.')
parser.add_argument("--min_seq_separation",  default = '8', help = 'min separation in sequence between two residues for them to be counted as being in contact. If not specified, then --substructure_file must be specifed instead so that pre-computed substructures can be used. Defaults to 8')
parser.add_argument("--min_clustersize", default = '5', help = 'mininmum size of island of contacts for it to be considered a substructure. If not specified, then --substructure_file must be specifed instead so that pre-computed substructures can be used. Defaults to 5')
parser.add_argument("--contact_sep_thresh",  default = '5', help = 'two contacts cannot be separated by a Manhattan distance greater than this on the contact map for them to be grouped into an island. If not specified, then --substructure_file must be specifed instead so that pre-computed substructures can be used. Defaults to 5')

parser.add_argument("--substructure_file", required = False, help = 'Path to a .dat file which contains substructures and native distances. This could have been generated by creating a Protein class and running generate_subs. If this is not specified, then the arguments --d_cutoff, --min_seq_separation, --min_islesize, and --max_contact_sep must be specified instead so that substructures can be computed de novo. Note that if a substructure_file is provided, the d_cutoff and min_seq_separation values there take precedence over any entered as command-line arguments')


parser.add_argument("--min_step", default = '0', help = 'minimum MC step to analyze. Defaults to 0.')
parser.add_argument("--max_step", default = 'inf', help = 'maximum MC step to analyze. Defaujlts to infinity.')


parser.add_argument("--temperatures", default = "*.***", type = str, help = "Temperatures at which you want to run analysis, as a comma-separated string. For instnace you can type --temperatures='0.800, 0.900' or --temperatures = '0.8**' or --temperatures='*.***', the latter being the default ")

parser.add_argument("--output_filename", default = "Equilibrium_scores.dat", help = "A file with this name, which contains scores for all snapshots, will be saved in directory. Defaults to Equilibrium_scores.dat ")

args = parser.parse_args()

directory = args.directory
fileroot = args.root
native_file=args.native_file
d_cutoff = float(args.d_cutoff)
min_seq_separation=int(args.min_seq_separation)
min_clustersize=int(args.min_clustersize)
contact_sep_thresh=int(args.contact_sep_thresh)
min_step=float(args.min_step)
max_step=float(args.max_step)
temperatures = [item for item in args.temperatures.split(',')]
output_filename=args.output_filename

if args.substructure_file:
    substructure_file=args.substructure_file
    print('Pre-made substructures provided in {}'.format(substructure_file))
    native_substructures, native_distances, d_cutoff, min_seq_separation = joblib.load(substructure_file)
    print('Pre-made substructures loaded..')
    #native_contacts=np.zeros(np.shape(native_distances))
    #native_contacts[np.where((native_distances<d_cutoff) & (native_distances!=0))]=1
    #note, by native_distances and native_contacts, we actually mean the distances/contacts that are found in the protein used to produce the substructure file
	#So for instance, if the substructure file is the full version of the protein of interest, then we define the contacts/distnaces in the full protein as the "native" ones
	#now, load the protein of interest's contacts to figure out if we need to truncate the native contacts	
    zz = dbfold.analyze_structures.find_native_contacts(native_file, d_cutoff, min_seq_separation)
    length=np.shape(zz)[0]
    if length<np.shape(native_substructures)[0]:
        native_substructures = native_substructures[0:length, 0:length, :]  #only keep whatever portions of substructures are acutally found in the current protein
        native_distances=native_distances[0:length, 0:length]  #same for native distances and contacts
        #native_contacts=native_contacts[0:length, 0:length]
else:
    print('Identifying substructures de novo!')
    native_contacts, native_substructures=dbfold.analyze_structures.identify_native_substructures(native_file, d_cutoff, min_seq_separation, contact_sep_thresh, min_clustersize, plot=False )
    native_distances=dbfold.analyze_structures.find_native_contacts(native_file, d_cutoff, min_seq_separation, mode = 'distances')
    print('{} snapshots identified!'.format(np.shape(native_substructures)[2]))


PDB_files=[]
for temp in temperatures: PDB_files+=glob.glob('{}/{}_{}*.*0'.format(directory, fileroot, temp))
times = dbfold.utils.get_times(PDB_files)
PDB_files = [file for f, file in enumerate(PDB_files) if times[f]>=min_step and times[f]<max_step ]
     
scores=np.zeros((len(PDB_files), np.shape(native_substructures)[2]))  #scores[f, n] tells you, for file f, what is the average distance between alpha carbons that were assigned to native substructure n    
print('Time to assign scores! \n {} files total'.format(len(PDB_files)))
for f, file in enumerate(PDB_files):
    if np.mod(f,500)==0:
        print('{} files completed'.format(f))
    scores[f,:], distances = dbfold.analyze_structures.score_snapshot(file, native_substructures, min_seq_separation = min_seq_separation)


joblib.dump([scores,  PDB_files, native_distances, native_substructures], open('{}/{}'.format(directory, output_filename), "wb"), compress=3 )
print('Scores obtained and saved! Have a good day!')