Skip to content

Commit

Permalink
Merge pull request #690 from haddocking/alascan2.0
Browse files Browse the repository at this point in the history
alascan module
  • Loading branch information
rvhonorato authored Oct 31, 2023
2 parents 5d84af1 + 7d5ae9a commit 7d2dfff
Show file tree
Hide file tree
Showing 16 changed files with 1,412 additions and 19 deletions.
2 changes: 2 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ignore:
- "tests"
37 changes: 37 additions & 0 deletions examples/analysis/alascan-test.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# ==================================================
# Alanine Scan with HADDOCK3
#
# This example workflow will refine a complex, producing 10
# refined models, cluster them, and then perform an
# alanine scan across the whole interface.
# ==================================================

# General parameters
run_dir = "run1-alanine-scan"
ncores = 10

# Input
molecules = ["../docking-protein-protein/data/e2a-hpr_1GGR.pdb"]

# Workflow definition
# ====================================================================
[topoaa]
autohis = true

[mdref]
# this will produce 10 refined models
sampling_factor = 10

[caprieval]
reference_fname="../docking-protein-protein/data/e2a-hpr_1GGR.pdb"

[rmsdmatrix]

[clustrmsd]
tolerance=2

[alascan]
scan_residue="ALA"
output=true
plot=true
int_cutoff = 3.0
73 changes: 73 additions & 0 deletions integration_tests/test_alascan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import tempfile
from pathlib import Path

import pytest
import shutil
import pandas as pd
import numpy as np

from haddock.modules.analysis.alascan import DEFAULT_CONFIG as DEFAULT_ALASCAN_CONFIG
from haddock.modules.analysis.alascan import HaddockModule as AlascanModule
from haddock.libs.libontology import PDBFile
from . import CNS_EXEC, DATA_DIR, has_cns
from tests import golden_data

@pytest.fixture
def alascan_module():
    """Yield an alascan module with default params and a 3.5 A interface cutoff."""
    # NOTE(review): the temporary directory is created for cleanup scoping but
    # its path is never used — the module runs with path="." ; confirm intent.
    with tempfile.TemporaryDirectory(dir=".") as _scratch:
        module = AlascanModule(
            order=0,
            path=".",
            initial_params=DEFAULT_ALASCAN_CONFIG,
        )
        module.params["int_cutoff"] = 3.5
        yield module

class MockPreviousIO:
    """Stand-in for the module's previous_io, serving two golden complexes."""

    def __init__(self, path):
        # Stored for API compatibility; retrieve_models copies into the cwd.
        self.path = path

    def retrieve_models(self, individualize: bool = False):
        """Copy the golden PDB complexes into the cwd and return them as models."""
        file_names = ("protprot_complex_1.pdb", "protprot_complex_2.pdb")
        for file_name in file_names:
            shutil.copy(Path(golden_data, file_name), Path(".", file_name))
        return [PDBFile(file_name=name, path=".") for name in file_names]

    def output(self):
        """No-op: the mock produces no module output file."""
        return None

@has_cns
def test_alascan_default(alascan_module, mocker):
    """Run the alascan module end-to-end and check its per-model and cluster CSVs.

    Uses MockPreviousIO to feed two golden protein-protein complexes, then
    verifies the expected output files exist, have the right shape, and that
    two reference delta_score values fall within tolerance.
    """
    alascan_module.previous_io = MockPreviousIO(path=alascan_module.path)
    alascan_module.run()

    expected_csv1 = Path(alascan_module.path, "scan_protprot_complex_1.csv")
    expected_csv2 = Path(alascan_module.path, "scan_protprot_complex_2.csv")
    expected_clt_csv = Path(alascan_module.path, "scan_clt_-.csv")

    assert expected_csv1.exists(), f"{expected_csv1} does not exist"
    assert expected_csv2.exists(), f"{expected_csv2} does not exist"
    assert expected_clt_csv.exists(), f"{expected_clt_csv} does not exist"

    # per-model scan table: 10 scanned residues x 16 columns
    df = pd.read_csv(expected_csv1, sep="\t", comment="#")
    assert df.shape == (10, 16), f"{expected_csv1} has wrong shape"
    # first ARG (ARG 17 B) should have a delta_score approximately equal to 28.53
    # select the scalar explicitly instead of asserting on a Series, which
    # relies on ambiguous 1-element array truthiness and hides match errors
    arg_delta = df.loc[df["ori_resname"] == "ARG"].iloc[0]["delta_score"]
    assert np.isclose(arg_delta, 28.53, atol=10)

    # cluster-level table: 18 residues x 11 columns
    df_clt = pd.read_csv(expected_clt_csv, sep="\t", comment="#")
    assert df_clt.shape == (18, 11), f"{expected_clt_csv} has wrong shape"
    # average delta score of A-38-ASP should be around 8.18
    asp_rows = df_clt.loc[df_clt["full_resname"] == "A-38-ASP"]
    assert np.isclose(asp_rows["delta_score"].iloc[0], 8.18, atol=2)

32 changes: 20 additions & 12 deletions src/haddock/clis/cli_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@

ap.add_argument("pdb_file", help="Input PDB file")

ap.add_argument(
"--run_dir",
default="haddock-score-client",
type=str,
required=False,
help="Run directory name.",
)

ap.add_argument(
"--full",
action="store_true",
Expand Down Expand Up @@ -98,13 +106,14 @@ def maincli() -> None:


def main(
pdb_file: FilePath,
full: bool = False,
outputpdb: bool = False,
outputpsf: bool = False,
keep_all: bool = False,
**kwargs: Any,
) -> None:
pdb_file: FilePath,
run_dir: FilePath,
full: bool = False,
outputpdb: bool = False,
outputpsf: bool = False,
keep_all: bool = False,
**kwargs: Any,
) -> None:
"""
Calculate the score of a complex using the ``emscoring`` module.
Expand Down Expand Up @@ -180,7 +189,7 @@ def main(

print("> starting calculations...")

run_dir = Path("haddock-score-client")
run_dir = Path(run_dir)
with suppress(FileNotFoundError):
shutil.rmtree(run_dir)
run_dir.mkdir()
Expand Down Expand Up @@ -241,10 +250,9 @@ def main(
shutil.rmtree(run_dir)
else:
print(
"The folder where the calculations where performed was kept. See "
"folder: haddock-scoring-client"
)

'The folder where the calculations where performed was kept. See '
f'folder: {run_dir}'
)

if __name__ == "__main__":
sys.exit(maincli()) # type: ignore
18 changes: 14 additions & 4 deletions src/haddock/libs/libalign.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,13 @@ def centroid(X: NDFloat) -> NDFloat:


def load_coords(
pdb_f, atoms, filter_resdic=None, numbering_dic=None, model2ref_chain_dict=None
):
pdb_f,
atoms,
filter_resdic=None,
numbering_dic=None,
model2ref_chain_dict=None,
add_resname=None,
):
"""
Load coordinates from PDB.
Expand All @@ -420,6 +425,9 @@ def load_coords(
numbering_dic : dict
dict of numbering dictionaries (one dictionary per chain)
add_resname : bool
use the residue name in the identifier
Returns
-------
coord_dic : dict
Expand Down Expand Up @@ -460,8 +468,10 @@ def load_coords(
# " was not matched!"
# )
continue
# identifier = f"{chain}.{resnum}.{atom_name}"
identifier = (chain, resnum, atom_name)
if add_resname is True:
identifier = (chain, resnum, atom_name, resname)
else:
identifier = (chain, resnum, atom_name)
if atom_name not in atoms[resname]:
continue
if chain not in chain_dic:
Expand Down
30 changes: 30 additions & 0 deletions src/haddock/libs/libparallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,36 @@ def split_tasks(lst: Sequence[AnyT],
yield chunk


def get_index_list(nmodels, ncores):
    """
    Optimal distribution of models among cores.

    Parameters
    ----------
    nmodels : int
        Number of models to be distributed.
    ncores : int
        Number of cores to be used.

    Returns
    -------
    index_list : list
        List of model indexes to be used for the parallel scanning.
    """
    # every core gets at least `base` models; the first `extra` cores get one more
    base, extra = divmod(nmodels, ncores)
    boundaries = [0]
    for core in range(ncores):
        chunk = base + 1 if core < extra else base
        boundaries.append(boundaries[-1] + chunk)
    return boundaries


class Worker(Process):
"""Work on tasks."""

Expand Down
83 changes: 83 additions & 0 deletions src/haddock/libs/libplots.py
Original file line number Diff line number Diff line change
Expand Up @@ -993,3 +993,86 @@ def report_generator(boxes, scatters, tables, step):
html_report = _generate_html_report(step, figures)
with open("report.html", "w", encoding="utf-8") as report:
report.write(html_report)


def make_alascan_plot(df, clt_id, scan_res="ALA"):
    """
    Make a plotly interactive plot.

    Score components are here **weighted** by their respective
    contribution to the total score.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the results of the alanine scan.
    clt_id : int
        Cluster ID.
    scan_res : str, optional
        Residue name used for the scan, by default "ALA"
    """
    plot_name = f"scan_clt_{clt_id}"
    log.info(f"Generating {scan_res} scanning plot {plot_name}")

    # create figure
    fig = go.Figure(layout={"width": 2000, "height": 1000})

    # one grouped bar per score component; delta_elec carries the weight it is
    # given in the emscoring module (0.2), the others are unweighted
    components = (
        ("delta_score", 1.0),
        ("delta_vdw", 1.0),
        ("delta_elec", 0.2),
        ("delta_desolv", 1.0),
    )
    for component, weight in components:
        fig.add_trace(
            go.Bar(
                x=df["full_resname"],
                y=weight * df[component],
                name=component,
            )
        )

    # prettifying layout
    fig.update_layout(
        title=f"{scan_res} scanning cluster {clt_id}",
        xaxis=dict(
            title="Residue Name",
            tickfont_size=14,
            titlefont_size=16,
            tick0=df["full_resname"],
            # in case we want to show less residues
            # dtick=10,
        ),
        yaxis=dict(
            # fixed typo in the axis label ("Weigted" -> "Weighted")
            title="Weighted delta",
            titlefont_size=16,
            tickfont_size=14,
        ),
        legend=dict(x=1.01, y=1.0, font_family="Helvetica", font_size=16),
        barmode="group",
        bargap=0.05,
        bargroupgap=0.05,
        hovermode="x unified",
        hoverlabel=dict(font_size=16, font_family="Helvetica"),
    )
    # light separators between residue groups
    for n in range(df.shape[0] - 1):
        fig.add_vline(x=0.5 + n, line_color="gray", opacity=0.2)
    # save html
    html_output_filename = f"{plot_name}.html"
    fig.write_html(html_output_filename)
2 changes: 1 addition & 1 deletion src/haddock/modules/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import Iterable


modules_using_resdic = ("caprieval", "rmsdmatrix")
modules_using_resdic = ("caprieval", "rmsdmatrix", "alascan")


def confirm_resdic_chainid_length(params: Iterable[str]) -> None:
Expand Down
Loading

0 comments on commit 7d2dfff

Please sign in to comment.