Merge pull request GFNOrg#2 from bengioe/mj10/add_dock_class

Improvements
ThrunGroup · Oct 5, 2021 · 7428f38 · 7428f38
2 parents e7833bc + 6ba40e6
commit 7428f38
Show file tree

Hide file tree

Showing 5 changed files with 162 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -16,11 +16,13 @@ Additional requirements for active learning experiments:
 ## Molecule experiments
 
 Additional requirements:
-- `pandas rdkit torch_geometric h5py`
+- `pandas rdkit torch_geometric h5py ray`
 - a few biochemistry programs, see `mols/Programs/README`
 
 For `rdkit` in particular we found it to be easier to install through (mini)conda. [`torch_geometric`](https://github.com/rusty1s/pytorch_geometric) has non-trivial installation instructions.
 
+If you have CUDA 10.1 configured, you can run `pip install -r requirements.txt`. You can also change `requirements.txt` to match your CUDA version. (Replace cu101 to cuXXX, where XXX is your CUDA version).
+
 We compress the 300k molecule dataset for size. To uncompress it, run `cd mols/data/; gunzip docked_mols.h5.gz`.
 
 We omit docking routines since they are part of a separate contribution still to be submitted. These are available on demand, please do reach out to [email protected] or [email protected].
diff --git a/mols/Programs/README b/mols/Programs/README
@@ -9,4 +9,4 @@ MGL tools can be downloaded from https://ccsb.scripps.edu/mgltools/downloads/ (b
 
 AutoDock Vina can be downloaded from http://vina.scripps.edu/download.html
 
-OpenBabel can be downloaded from https://github.com/openbabel/openbabel/releases/tag/openbabel-3-1-1 (building required)
+OpenBabel can be downloaded from https://github.com/openbabel/openbabel/releases/tag/openbabel-3-1-1 (building required, make sure after the build, the bin folder is in your PATH and lib in LD_LIBRARY_PATH)
diff --git a/mols/gflownet_activelearning.py b/mols/gflownet_activelearning.py
@@ -31,7 +31,7 @@
 from torch.distributions.categorical import Categorical
 
 from utils import chem
-
+import ray
 from mol_mdp_ext import MolMDPExtended, BlockMoleculeDataExtended
 
 import model_atom, model_block, model_fingerprint
@@ -436,8 +436,28 @@ def stop_everything():
                             'test_infos': test_infos,
                             'train_infos': train_infos}
 
+@ray.remote
+class _SimDockLet:
+    def __init__(self, tmp_dir):
+        self.dock = chem.DockVina_smi(tmp_dir)
+        self.target_norm = [-8.6, 1.10]
+
+    def eval(self, mol, norm=False):
+        s = "None"
+        try:
+            s = Chem.MolToSmiles(mol.mol)
+            print("docking {}".format(s))
+            _, r, _ = self.dock.dock(s)
+        except Exception as e: # Sometimes the prediction fails
+            print('exception for', s, e)
+            r = 0
+        if not norm:
+            return r
+        reward = -(r-self.target_norm[0])/self.target_norm[1]
+        return reward
+
 
-def sample_and_update_dataset(args, model, proxy_dataset, generator_dataset, docker):
+def sample_and_update_dataset(args, model, proxy_dataset, generator_dataset, dock_pool):
     # generator_dataset.set_sampling_model(model, docker, sample_prob=args.sample_prob)
     # sampler = generator_dataset.start_samplers(8, args.num_samples)
     print("Sampling")
@@ -470,13 +490,20 @@ def sample_and_update_dataset(args, model, proxy_dataset, generator_dataset, doc
             # print('skip', mol.blockidxs, mol.jbonds)
             continue
         # print('here')
-        score = docker.eval(mol, norm=False)
-        mol.reward = proxy_dataset.r2r(score)
+        # score = docker.eval(mol, norm=False)
+        # mol.reward = proxy_dataset.r2r(score)
         # mol.smiles = s
         smis.append(mol.smiles)
-        rews.append(mol.reward)
-        print(mol.smiles, mol.reward)
+        # rews.append(mol.reward)
+        # print(mol.smiles, mol.reward)
         sampled_mols.append(mol)
+
+    t0 = time.time()
+    rews = list(dock_pool.map(lambda a, m: a.eval.remote(m), sampled_mols))
+    t1 = time.time()
+    print('Docking sim done in {}'.format(t1-t0))
+    for i in range(len(sampled_mols)):
+        sampled_mols[i].reward = rews[i]
 
     print("Computing distances")
     dists =[]
@@ -504,7 +531,9 @@ def main(args):
     reward_norm = args.reward_norm
     rews = []
     smis = []
-    docker = Docker(tmp_dir, cpu_req=args.cpu_req)
+    actors = [_SimDockLet.remote(tmp_dir)
+                    for i in range(10)]
+    pool = ray.util.ActorPool(actors)
     args.repr_type = proxy_repr_type
     args.replay_mode = "dataset"
     args.reward_exp = 1
@@ -548,7 +577,7 @@ def main(args):
 
         print(f"Sampling mols: {i}")
         # sample molecule batch for generator and update dataset with docking scores for sampled batch
-        _proxy_dataset, r, s, batch_metrics = sample_and_update_dataset(args, model, proxy_dataset, gen_model_dataset, docker)
+        _proxy_dataset, r, s, batch_metrics = sample_and_update_dataset(args, model, proxy_dataset, gen_model_dataset, pool)
         print(f"Batch Metrics: dists_mean: {batch_metrics['dists_mean']}, dists_sum: {batch_metrics['dists_sum']}, reward_mean: {batch_metrics['reward_mean']}, reward_max: {batch_metrics['reward_max']}")
         rews.append(r)
         smis.append(s)

diff --git a/mols/utils/chem.py b/mols/utils/chem.py
@@ -33,6 +33,14 @@
                   "Zr": 40, "Nb": 41, "Mo": 42, "Tc": 43, "Ru": 44, "Rh": 45, "Pd": 46, "Ag": 47, "Cd": 48, "In": 49, "Sn": 50, "Sb": 51,
                   "Te": 52, "I": 53, "Xe": 54, "Cs": 55, "Ba": 56}
 
+
+def onehot(arr, num_classes, dtype=np.int):
+    arr = np.asarray(arr, dtype=np.int)
+    assert len(arr.shape) ==1, "dims other than 1 not implemented"
+    onehot_arr = np.zeros(arr.shape + (num_classes,), dtype=dtype)
+    onehot_arr[np.arange(arr.shape[0]), arr] = 1
+    return onehot_arr
+
 def mol_from_frag(jun_bonds, frags=None, frag_smis=None, coord=None, optimize=False):
     "joins 2 or more fragments into a single molecule"
     jun_bonds = np.asarray(jun_bonds)
@@ -209,3 +217,104 @@ def mol_to_graph_backend(atmfeat, coord, bond, bondfeat, props={}, data_cls=Data
     else:
         data = data_cls(x=atmfeat, edge_index=edge_index, edge_attr=edge_attr, **props)
     return data
+
+class GenMolFile:
+    def __init__(self, outpath, mgltools, mglbin):
+        self.outpath = outpath
+        self.prepare_ligand4 = os.path.join(mgltools, "AutoDockTools/Utilities24/prepare_ligand4.py")
+        self.mglbin = mglbin
+
+        os.makedirs(os.path.join(outpath, "sdf"), exist_ok=True)
+        os.makedirs(os.path.join(outpath, "mol2"), exist_ok=True)
+        os.makedirs(os.path.join(outpath, "pdbqt"), exist_ok=True)
+
+    def __call__(self, smi, mol_name, num_conf):
+        sdf_file = os.path.join(self.outpath, "sdf", f"{mol_name}.sdf")
+        mol2_file = os.path.join(self.outpath, "mol2", f"{mol_name}.mol2")
+        pdbqt_file = os.path.join(self.outpath, "pdbqt", f"{mol_name}.pdbqt")
+
+        mol = Chem.MolFromSmiles(smi)
+        Chem.SanitizeMol(mol)
+        mol_h = Chem.AddHs(mol)
+        AllChem.EmbedMultipleConfs(mol_h, numConfs=num_conf)
+        for i in range(num_conf):
+            AllChem.MMFFOptimizeMolecule(mol_h, confId=i)
+        mp = AllChem.MMFFGetMoleculeProperties(mol_h, mmffVariant='MMFF94')
+        # choose minimum energy conformer
+        mi = np.argmin([AllChem.MMFFGetMoleculeForceField(mol_h, mp, confId=i).CalcEnergy() for i in range(num_conf)])
+        print(Chem.MolToMolBlock(mol_h, confId=int(mi)), file=open(sdf_file, 'w+'))
+        os.system(f"obabel -isdf {sdf_file} -omol2 -O {mol2_file}")
+        os.system(f"{self.mglbin}/pythonsh {self.prepare_ligand4} -l {mol2_file} -o {pdbqt_file}")
+        return pdbqt_file
+
+
+class DockVina_smi:
+    def __init__(self,
+                 outpath,
+                 mgltools_dir=os.path.join("Programs", "mgltools_x86_64Linux2_1.5.6"),
+                 vina_dir=os.path.join("Programs", "vina"),
+                 docksetup_dir=os.path.join("data", "seh/4jnc"),
+                 rec_file="4jnc.nohet.aligned.pdbqt",
+                 bindsite=(-13.4, 26.3, -13.3, 20.013, 16.3, 18.5),
+                 dock_pars="",
+                 cleanup=True):
+
+        self.outpath = outpath
+        self.mgltools = os.path.join(mgltools_dir, "MGLToolsPckgs")
+        self.mgltools_bin = os.path.join(mgltools_dir, "bin")
+        self.vina_bin = os.path.join(vina_dir, "bin/vina")
+        self.rec_file = os.path.join(docksetup_dir, rec_file)
+        self.bindsite = bindsite
+        self.dock_pars = dock_pars
+        self.cleanup = cleanup
+
+        self.gen_molfile = GenMolFile(self.outpath, self.mgltools, self.mgltools_bin)
+        # make vina command
+        self.dock_cmd = "{} --receptor {} " \
+                        "--center_x {} --center_y {} --center_z {} " \
+                        "--size_x {} --size_y {} --size_z {} "
+        self.dock_cmd = self.dock_cmd.format(self.vina_bin, self.rec_file, *self.bindsite)
+        self.dock_cmd += " --ligand {} --out {}"
+
+        os.makedirs(os.path.join(self.outpath, "docked"), exist_ok=True)
+
+    def dock(self, smi, mol_name=None, molgen_conf=20):
+        mol_name = mol_name or ''.join(random.choices(string.ascii_uppercase + string.digits, k=15))
+        docked_file = os.path.join(self.outpath, "docked", f"{mol_name}.pdb")
+        input_file = self.gen_molfile(smi, mol_name, molgen_conf)
+        # complete docking query
+        dock_cmd = self.dock_cmd.format(input_file, docked_file)
+        dock_cmd = dock_cmd + " " + self.dock_pars
+
+        # dock
+        cl = subprocess.Popen(dock_cmd, shell=True, stdout=subprocess.PIPE)
+        cl.wait()
+        # parse energy
+        with open(docked_file) as f:
+            docked_pdb = f.readlines()
+        if docked_pdb[1].startswith("REMARK VINA RESULT"):
+            dockscore = float(docked_pdb[1].split()[3])
+        else:
+            raise Exception("Can't parse docking energy")
+        # parse coordinates
+        # TODO: fix order
+        coord = []
+        endmodel_idx = 0
+        for idx, line in enumerate(docked_pdb):
+            if line.startswith("ENDMDL"):
+                endmodel_idx = idx
+                break
+
+        docked_pdb_model_1 = docked_pdb[:endmodel_idx]  # take only the model corresponding to the best energy
+        docked_pdb_model_1_atoms = [line for line in docked_pdb_model_1 if line.startswith("ATOM")
+                                    and line.split()[2][0] != 'H']  # ignore hydrogen
+        coord.append([line.split()[-7:-4] for line in docked_pdb_model_1_atoms])
+        coord = np.array(coord, dtype=np.float32)
+
+        if self.cleanup:
+            with contextlib.suppress(FileNotFoundError):
+                os.remove(os.path.join(self.outpath, "sdf", f"{mol_name}.sdf"))
+                os.remove(os.path.join(self.outpath, "mol2", f"{mol_name}.mol2"))
+                os.remove(os.path.join(self.outpath, "pdbqt", f"{mol_name}.pdbqt"))
+                os.remove(os.path.join(self.outpath, "docked", f"{mol_name}.pdb"))
+        return mol_name, dockscore, coord
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,12 @@
+mkl-fft==1.3.0
+mkl-random==1.2.2
+mkl-service==2.4.0
+numpy
+pandas==1.3.3
+ray==1.1.0
+rdkit-pypi
+torch==1.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+torch-geometric==1.6.3
+torch-scatter==2.0.5 -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
+torch-sparse==0.6.8 -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
+tqdm==4.36.1
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,4 +9,4 @@ MGL tools can be downloaded from https://ccsb.scripps.edu/mgltools/downloads/ (b

		AutoDock Vina can be downloaded from http://vina.scripps.edu/download.html

		OpenBabel can be downloaded from https://github.com/openbabel/openbabel/releases/tag/openbabel-3-1-1 (building required)
		OpenBabel can be downloaded from https://github.com/openbabel/openbabel/releases/tag/openbabel-3-1-1 (building required, make sure after the build, the bin folder is in your PATH and lib in LD_LIBRARY_PATH)