Scripts to run PIPER-FPD protocol #77

Merged 1 commit on Mar 24, 2019
59 changes: 59 additions & 0 deletions peptide_docking/README
@@ -0,0 +1,59 @@
Global peptide docking using PIPER-FlexPepDock:
This is the stand-alone version of the PIPER-FlexPepDock protocol, as implemented in the server at
http://piperfpd.furmanlab.cs.huji.ac.il/

The script runs the PIPER-FlexPepDock protocol using Rosetta version 2018.07 and the 'ref2015' scoring function
(benchmarked to reproduce results similar to the original version published in
http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005905#sec030).
To restore this original version, add the '--restore_talaris_behaviour' option and make sure you have the
Rosetta 2016.20 release compiled.

To run this script:

(1) Download PIPER (https://cluspro.bu.edu)

(2) Change the paths in pfpd_const.py (your Rosetta directory and the directory containing these scripts)

(3) Adapt the script to your cluster environment:
After the input fragments are ready, 50 PIPER jobs need to be executed (it is highly recommended to run them in
parallel). Once all of them are done, 12,500 models (the top 250 PIPER models from each of the 50 runs) are refined
with Rosetta FlexPepDock.
We use the SLURM workload manager to run jobs on our cluster. All the SLURM-dependent functions are concentrated
in the slurm_jobs.py file; you will need to adapt them to your job manager (a sketch of such a wrapper is shown below).
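
For orientation, here is a minimal sketch (not the actual slurm_jobs.py, and assuming a standard 'sbatch'
interface) of the kind of submission wrapper those functions provide; the function name and options are
illustrative only:

    import subprocess

    def submit_job(command, job_name, cpus=1):
        # Submit `command` through SLURM's sbatch and return the job id so that
        # later steps can wait on it. For another scheduler, replace sbatch and
        # its options with the equivalents (e.g. qsub or bsub).
        sbatch = 'sbatch --job-name={n} --cpus-per-task={c} --wrap="{cmd}"'.format(
            n=job_name, c=cpus, cmd=command)
        out = subprocess.check_output(sbatch, shell=True).decode()
        return out.strip().split()[-1]  # sbatch prints "Submitted batch job <id>"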

Options:
Two mandatory arguments: the receptor structure ('-r') and the peptide sequence ('-p').
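A hypothetical invocation (the name of the driver script is a placeholder here, since it is not part of this diff;
the flags are those documented in this section):

    python run_pfpd.py -r receptor.pdb -p <peptide_sequence> --native native.pdb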

If a native structure is available, pass it to the script with the '--native' flag.

For receptor minimization, add the '--receptor_min' flag.

To restore the protocol published in http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005905#sec030,
add the '--restore_talaris_behaviour' option and make sure you have the Rosetta 2016.20 release compiled.

If the secondary structure of the peptide is known, pass a file containing it (one letter per residue, e.g. 'LLLEEEHHH') with the '--sec_struct' option.

If the secondary structure is not known, you can obtain a more accurate prediction by passing the FASTA of the full
protein from which the peptide is derived (or any longer sequence containing it) with the '--pep_from_fasta' option;
the prediction is then performed on the longer sequence and the peptide motif is cut out of it.

Sometimes the prediction can be improved by extending the fragment library with helices and beta-strands in addition
to the fragments of predicted secondary structure. To extend the library, add the '--add_alpha_beta' option.

References:
Please cite the following when referring to results of this protocol:

Alam, Goldstein, Xia, Porter, Kozakov, Schueler-Furman. High-resolution global peptide-protein docking using
fragments-based PIPER-FlexPepDock. (2017) PLoS Computational Biology 13:e1005905

Porter KA, Xia B, Beglov D, Bohnuud T, Alam N, Schueler-Furman O, Kozakov D. ClusPro PeptiDock: Efficient global docking
of peptide recognition motifs using FFT. (2017) Bioinformatics btx216

Raveh B, London N, Schueler-Furman O. Sub-angstrom modeling of complexes between flexible peptides and globular
proteins. (2010) Proteins 78:2029-40

Kozakov D, Brenke R, Comeau SR, Vajda S. PIPER: An FFT-based protein docking program with pairwise potentials.
(2006) Proteins: Structure, Function, and Bioinformatics 65:392–406

The code was written by:
[email protected]
126 changes: 126 additions & 0 deletions peptide_docking/clustering.py
@@ -0,0 +1,126 @@
#!/usr/bin/python

import sys
import os
import pfpd_const as protocol
import math

HEADER = 'DecoyID\t\tClusterN MemberID\tI_sc\tReweighted_sc'
FINAL_DIR = '../../FINAL_RESULTS'
CLUSTERING_DIR = os.getcwd()


def create_pdb_list():
    """Select the top decoys (by reweighted score) that will enter clustering."""
    with open(sc_file, 'r') as score_file:
        scores = score_file.readlines()
    header = scores[1].split()  # scores[0] is the 'SEQUENCE:' line
    clean_scores = []
    for line in scores:
        # Keep only per-decoy score lines; skip the header and the 'SEQUENCE:' line.
        if line.startswith('SCORE:') and line.split()[1] != 'total_score':
            clean_scores.append(line)

reweighted_column = header.index('reweighted_sc')
description_column = header.index('description')

total_decoys = len(clean_scores)
clustering_pool_num = int(total_decoys / 100)

with open('pdb_list', 'w') as pdbs:
i = 0
for line in sorted(clean_scores, key=lambda sc_line: float(sc_line.split()[reweighted_column])):
pdbs.write(line.split()[description_column] + '\n')
i += 1
if i >= clustering_pool_num:
break
return clustering_pool_num, clean_scores


def define_actual_r():
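    """Scale the requested peptide clustering radius to a whole-complex radius.

    The cluster application computes RMSD over the whole complex, in which the
    receptor is essentially rigid, so a peptide radius r corresponds to roughly
    r * sqrt(N_peptide_CA / N_total_CA) over the complex (CA counts are taken
    from the reference structure; the peptide is assumed to be chain B).
    """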
with open(native, 'r') as n:
calphas = 0
pep_calphas = 0
for line in n:
if line[13:15] == 'CA':
calphas += 1
if line[21] == 'B':
pep_calphas += 1
actual_r = math.sqrt(float(pep_calphas) / float(calphas)) * float(radius)
print('Actual radius is ' + str(actual_r))
return actual_r


def run_clustering(actual_r):
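    """Run the Rosetta cluster application on the decoys listed in pdb_list, at the scaled radius."""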
os.system(os.path.join(protocol.ROSETTA_BIN, 'cluster.linuxgccrelease') +
' -in:file:silent {decoys} -in:file:silent_struct_type binary -database {DB}'
' -cluster:radius {actualR} -in:file:fullatom -tags `cat pdb_list`'
' -silent_read_through_errors > clog'.format(decoys=silent_input,
DB=protocol.ROSETTA_DB, actualR=actual_r))


def results_processing():
"""Write sorted scores"""
print('Now printing results')
with open('clog', 'r') as log:
whole_log = log.readlines()
with open('cluster_list', 'w') as cluster_lst:
decoys_lst = []
final_lines_idx = whole_log.index('Timing: \n')
relevant_lines = whole_log[final_lines_idx - clustering_pool:final_lines_idx]
for line in relevant_lines:
cluster_lst.write(line.split()[2] + '\t' + line.split()[3] + '\t' + line.split()[4] + '\n')
decoys_lst.append(line.split()[2] + '\t' + line.split()[3] + '\t' + line.split()[4])
with open(sc_file, 'r') as score_file:
score_file.readline() # 'SEQUENCE' line
header = score_file.readline().split()
interface_sc_idx = header.index('I_sc')
reweighted_sc_idx = header.index('reweighted_sc')
description_idx = header.index('description')
with open('cluster_list_sc', 'w') as cluster_lst_sc:
cluster_lst_sc.write(HEADER + '\n')
for decoy_line in decoys_lst:
for scores_line in score_lines:
if scores_line.split()[description_idx] == decoy_line.split()[0]:
cluster_lst_sc.write(decoy_line + '\t' + scores_line.split()[interface_sc_idx] + '\t' +
scores_line.split()[reweighted_sc_idx] + '\n')

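    # The shell pipelines below keep one member per cluster (the first one after
    # sorting by score, i.e. the best-scoring member) and then report the top 20
    # clusters sorted by I_sc and by reweighted_sc, respectively.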
os.system('echo "{}" >cluster_list_I_sc_sorted'.format(HEADER))
os.system('sort -nk 4 cluster_list_sc | sort -u -k2,2 | '
'sort -nk 4 | head -20 >>cluster_list_I_sc_sorted')

os.system('echo "{}" >cluster_list_reweighted_sc_sorted'.format(HEADER))
os.system('sort -nk 5 cluster_list_sc | sort -u -k2,2 | '
'sort -nk 5 | head -20 >>cluster_list_reweighted_sc_sorted')


def collect_results():
if not os.path.exists(FINAL_DIR):
os.makedirs(FINAL_DIR)
os.system(protocol.COPY.format(os.path.join(CLUSTERING_DIR, 'cluster_list_reweighted_sc_sorted'),
FINAL_DIR))
with open(os.path.join(FINAL_DIR, 'cluster_list_reweighted_sc_sorted')) as top_reweighted:
top_reweighted.readline()
for i in range(10):
cur_line = top_reweighted.readline().split()
struct = 'c.{cluster}.{member}.pdb'.format(cluster=cur_line[1], member=cur_line[2])
os.system(protocol.COPY.format(os.path.join(CLUSTERING_DIR, struct), FINAL_DIR))
    # Remove the (now redundant) compressed refinement inputs from the parent directory.
    for gz_file in os.listdir('..'):
        gz_path = os.path.join('..', gz_file)
        if os.path.isfile(gz_path) and os.path.splitext(gz_file)[1] == '.gz':
            os.remove(gz_path)


if __name__ == "__main__":
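    # command-line arguments: peptide clustering radius, reference PDB (the 'native'
    # argument, used for counting CA atoms), and the silent file with refined decoys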

radius = sys.argv[1]
native = sys.argv[2]
silent_input = sys.argv[3]

if os.path.isfile('../rescore.sc'):
sc_file = '../rescore.sc'
else:
sc_file = '../score.sc'

clustering_pool, score_lines = create_pdb_list()
actual_radius = define_actual_r()
run_clustering(actual_radius)
results_processing()
collect_results()
162 changes: 162 additions & 0 deletions peptide_docking/pfpd_const.py
@@ -0,0 +1,162 @@
#!/usr/bin/python3

import os

#########################
""" Change paths here """
#########################

ROSETTA_DIR = '/vol/ek/Home/alisa/rosetta/Rosetta/'
ROSETTA_2016_DIR = '/vol/ek/share/rosetta/rosetta_src_2016.20.58704_bundle/'

ROSETTA_DB = os.path.join(ROSETTA_DIR, 'main/database')
ROSETTA_2016_DB = os.path.join(ROSETTA_2016_DIR, 'main/database')

ROSETTA_BIN = os.path.join(ROSETTA_DIR, 'main/source/bin/')
ROSETTA_2016_BIN = os.path.join(ROSETTA_2016_DIR, 'main/source/bin/')

ROSETTA_TOOLS = os.path.join(ROSETTA_DIR, 'tools/')

PFPD_SCRIPTS = os.path.join(ROSETTA_TOOLS, 'peptide_docking/')

PIPER_DIR = '/vol/ek/Home/alisa/PIPER/'

######################################
"""DO NOT change following commands"""
######################################

# Commands (ROSETTA)

CLEAN_PDB = os.path.join(ROSETTA_TOOLS, 'protein_tools/scripts/clean_pdb.py >> pdb_log ') + ' {} {}'

FIXBB_JD3 = 'mpirun -n 6 ' + os.path.join(ROSETTA_BIN, 'fixbb_jd3.mpiserialization.linuxgccrelease') + \
' -database ' + ROSETTA_DB + ' -in:file:job_definition_file {} > fixbb.log'
FIXBB_JD3_TALARIS = 'mpirun -n 6 ' + os.path.join(ROSETTA_BIN, 'fixbb_jd3.mpiserialization.linuxgccrelease') + \
' -database ' + ROSETTA_DB + ' -restore_talaris_behavior ' \
'-in:file:job_definition_file {} > fixbb.log'

FIXBB = os.path.join(ROSETTA_BIN, 'fixbb.linuxgccrelease') + ' -database ' + ROSETTA_DB + \
' -in:file:s {frag} -resfile {resfile} -ex1 -ex2 -use_input_sc -scorefile design_score.sc >design.log'
FIXBB_TALARIS = os.path.join(ROSETTA_2016_BIN, 'fixbb.linuxgccrelease') + ' -database ' + ROSETTA_2016_DB + \
' -in:file:s {frag} -resfile {resfile} -ex1 -ex2 -use_input_sc -scorefile design_score.sc >design.log'

BUILD_PEPTIDE = os.path.join(ROSETTA_BIN, 'BuildPeptide.linuxgccrelease') + ' -in:file:fasta {}' \
' -database ' + ROSETTA_DB + \
' -out:file:o peptide.pdb > build_peptide.log'

MAKE_FRAGMENTS = 'perl ' + os.path.join(ROSETTA_TOOLS, 'fragment_tools/make_fragments.pl') + \
' -verbose -id xxxxx {} 2>log'

FRAG_PICKER = os.path.join(ROSETTA_BIN, 'fragment_picker.linuxgccrelease') + \
' -database ' + ROSETTA_DB + ' @flags >makeFrags.log'

PREPACK = os.path.join(ROSETTA_BIN, 'FlexPepDocking.default.linuxgccrelease') + \
' -database ' + ROSETTA_DB + ' @prepack_flags >ppk.log'
PREPACK_TALARIS = 'mpirun -n 5 ' + os.path.join(ROSETTA_2016_BIN, 'FlexPepDocking.mpi.linuxgccrelease') + \
' -database ' + ROSETTA_2016_DB + ' @prepack_flags >ppk.log'

FPD = 'ls *gz >input_list\n' \
'mpirun ' + os.path.join(ROSETTA_BIN, 'FlexPepDocking.mpiserialization.linuxgccrelease') + \
' -database ' + ROSETTA_DB + ' @{flags} >>refinement_log'
FPD_TALARIS = 'ls *gz >input_list\n' \
'mpirun ' + os.path.join(ROSETTA_2016_BIN, 'FlexPepDocking.mpi.linuxgccrelease') + \
' -database ' + ROSETTA_2016_DB + ' @{flags} >>refinement_log'

EXTRACT_MODEL = 'python ' + PFPD_SCRIPTS + 'extract_top_model.py'
RESCORING = 'python ' + PFPD_SCRIPTS + 'rescoring.py {sc_func} {rec}'
CLUSTERING = 'python ' + PFPD_SCRIPTS + 'clustering.py 2.0 {native} {decoys}'
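
# The driver scripts (not part of this diff) are expected to fill the templates above
# with str.format, for example (illustrative only):
#   os.system(RESCORING.format(sc_func='ref2015', rec='receptor.pdb'))
#   os.system(CLUSTERING.format(native='native.pdb', decoys='decoys_rescored.silent'))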

# Commands (PIPER)

PREP_PDB = PIPER_DIR + 'protein_prep/prepare.py {}'


########################## Run PIPER and collect structures for refinement ###################################
# These commands are executed from the single run_piper script

PIPER_DOCKING = PIPER_DIR + "piper -vv -c1.0 -k4 --msur_k=1.0 --maskr=1.0 -T FFTW_EXHAUSTIVE -R {decoys} -t 1 -p " \
+ PIPER_DIR + "prms/atoms.prm -f " + PIPER_DIR + "prms/coeffs.prm -r " + PIPER_DIR + \
"prms/rots.prm {r} {l} >piper.log\n"

# Note: %-style placeholders are used in the shell snippets below because they
# contain literal brace syntax (awk blocks, ${var} expansions).
EXTRACT_DECOYS = "for f in `awk '{print $1}' ft.000.00 | head -%s`;" \
                 "do if [ ! -f ${f}.pdb ]; then " + PIPER_DIR + "apply_ftresult.py -i $f ft.000.00 " \
                 + PIPER_DIR + "prms/rots.prm %s --out-prefix $f;fi;done\n"

PREP_FPD_INPUTS = "piper_run=`pwd | awk -F'/' '{print $NF}'`\n" \
"for f in `ls [0-9]*.pdb`;" \
"do cat %s ${f} | grep ATOM >%s/${piper_run}.${f};gzip %s/${piper_run}.${f};done"

#################################################################################################################

# Other commands

COPY = 'cp {} {}'


# Other constants

FRAGS_NUM = 50 # default = 50
PIPER_MODELS_EXTRACTED = str(250) # default = 250
N_ROTS = str(70000) # default = 70000
WINDOWS_LENGTH = 6 # default = 6

FRAGS_FILE = 'frags.100.{}mers'
THREE_TO_ONE_AA = {'G': 'GLY',
'A': 'ALA',
'V': 'VAL',
'L': 'LEU',
'I': 'ILE',
'P': 'PRO',
'C': 'CYS',
'M': 'MET',
'H': 'HIS',
'F': 'PHE',
'Y': 'TYR',
'W': 'TRP',
'N': 'ASN',
'Q': 'GLN',
'S': 'SER',
'T': 'THR',
'K': 'LYS',
'R': 'ARG',
'D': 'ASP',
'E': 'GLU'}

PSIPRED_OUTPUT = ['C', 'E', 'H']

PDB = '{}_{}.pdb'
FASTA = '{}_{}.fasta'
BAD_NATIVE = "Warning: The native structure and the receptor differ in sequence"

# Common functions


def count_pdbs(folder):
"""Count files in a directory"""
return len([frag for frag in os.listdir(folder) if
os.path.isfile(os.path.join(folder, frag)) and
os.path.splitext(os.path.basename(frag))[1] == '.pdb'])


def count_dirs(folder):
"""Count directories in a directory"""
return len([d for d in os.listdir(folder) if
os.path.isdir(os.path.join(folder, d))])


def rename_chain(structure, chain_id):
"""Rename chain id of a structure"""
renamed_struct = []
with open(structure, 'r') as pdb:
pdb_lines = pdb.readlines()
for line in pdb_lines:
if line[0:4] != 'ATOM' and line[0:6] != 'HETATM':
continue
else:
new_line = list(line)
new_line[21] = chain_id
renamed_struct.append(''.join(new_line))
os.remove(structure)
with open(structure, 'w') as new_structure:
for new_chain_line in renamed_struct:
new_structure.write(new_chain_line)
44 changes: 44 additions & 0 deletions peptide_docking/rescoring.py
@@ -0,0 +1,44 @@
#!/usr/bin/python

import sys
import os
import pfpd_const as protocol


def rescoring_flags(lowest_sc_struct):
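    """Write the FlexPepDock flag file for rescoring; the lowest reweighted_sc model
    is used as the reference ('-native') structure for the rescoring run."""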
with open('rescore_flags', 'w') as rescore:
rescore.write('-jd2:mpi_file_buf_job_distributor\n'
'-in:file:silent decoys.silent\n'
'-scorefile rescore.sc\n'
'-out:file:silent_struct_type binary\n'
'-out:file:silent decoys_rescored.silent\n'
'-flexPepDocking:flexpep_score_only\n'
'-use_input_sc\n'
'-unboundrot {receptor}\n'
'-native {nat}\n'
'-mute all\n-unmute protocols.flexPepDocking'.format(nat=lowest_sc_struct, receptor=receptor))


def rescoring():
with open('score.sc', 'r') as score_file:
scores = score_file.readlines()
header = scores[1].split() # scores[0] = SEQUENCE:
scores = scores[2:]
reweighted_column = header.index('reweighted_sc')
description_column = header.index('description')

sorted_sc = sorted(scores, key=lambda sc_line: float(sc_line.split()[reweighted_column]))
lowest_sc_struct = sorted_sc[0].split()[description_column] + '.pdb'

rescoring_flags(lowest_sc_struct)

if sc_func == 'ref2015':
os.system(protocol.FPD.format(flags='rescore_flags'))
elif sc_func == 'talaris14':
os.system(protocol.FPD_TALARIS.format(flags='rescore_flags'))


if __name__ == "__main__":
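    # command-line arguments: scoring function ('ref2015' or 'talaris14') and the
    # receptor PDB that is passed to -unboundrot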
sc_func = sys.argv[1]
receptor = sys.argv[2]
rescoring()