diff --git a/README.md b/README.md index a3cf769..299a4e9 100644 --- a/README.md +++ b/README.md @@ -16,11 +16,13 @@ Additional requirements for active learning experiments: ## Molecule experiments Additional requirements: -- `pandas rdkit torch_geometric h5py` +- `pandas rdkit torch_geometric h5py ray` - a few biochemistry programs, see `mols/Programs/README` For `rdkit` in particular we found it to be easier to install through (mini)conda. [`torch_geometric`](https://github.com/rusty1s/pytorch_geometric) has non-trivial installation instructions. +If you have CUDA 10.1 configured, you can run `pip install -r requirements.txt`. You can also change `requirements.txt` to match your CUDA version. (Replace cu101 to cuXXX, where XXX is your CUDA version). + We compress the 300k molecule dataset for size. To uncompress it, run `cd mols/data/; gunzip docked_mols.h5.gz`. We omit docking routines since they are part of a separate contribution still to be submitted. These are available on demand, please do reach out to bengioe@gmail.com or mkorablyov@gmail.com. diff --git a/mols/Programs/README b/mols/Programs/README index 9d6d287..8cff6c5 100644 --- a/mols/Programs/README +++ b/mols/Programs/README @@ -9,4 +9,4 @@ MGL tools can be downloaded from https://ccsb.scripps.edu/mgltools/downloads/ (b AutoDock Vina can be downloaded from http://vina.scripps.edu/download.html -OpenBabel can be downloaded from https://github.com/openbabel/openbabel/releases/tag/openbabel-3-1-1 (building required) +OpenBabel can be downloaded from https://github.com/openbabel/openbabel/releases/tag/openbabel-3-1-1 (building required, make sure after the build, the bin folder is in your PATH and lib in LD_LIBRARY_PATH) diff --git a/mols/gflownet_activelearning.py b/mols/gflownet_activelearning.py index 988deb6..f8f8193 100644 --- a/mols/gflownet_activelearning.py +++ b/mols/gflownet_activelearning.py @@ -31,7 +31,7 @@ from torch.distributions.categorical import Categorical from utils import chem - +import ray from mol_mdp_ext import MolMDPExtended, BlockMoleculeDataExtended import model_atom, model_block, model_fingerprint @@ -436,8 +436,28 @@ def stop_everything(): 'test_infos': test_infos, 'train_infos': train_infos} +@ray.remote +class _SimDockLet: + def __init__(self, tmp_dir): + self.dock = chem.DockVina_smi(tmp_dir) + self.target_norm = [-8.6, 1.10] + + def eval(self, mol, norm=False): + s = "None" + try: + s = Chem.MolToSmiles(mol.mol) + print("docking {}".format(s)) + _, r, _ = self.dock.dock(s) + except Exception as e: # Sometimes the prediction fails + print('exception for', s, e) + r = 0 + if not norm: + return r + reward = -(r-self.target_norm[0])/self.target_norm[1] + return reward + -def sample_and_update_dataset(args, model, proxy_dataset, generator_dataset, docker): +def sample_and_update_dataset(args, model, proxy_dataset, generator_dataset, dock_pool): # generator_dataset.set_sampling_model(model, docker, sample_prob=args.sample_prob) # sampler = generator_dataset.start_samplers(8, args.num_samples) print("Sampling") @@ -470,13 +490,20 @@ def sample_and_update_dataset(args, model, proxy_dataset, generator_dataset, doc # print('skip', mol.blockidxs, mol.jbonds) continue # print('here') - score = docker.eval(mol, norm=False) - mol.reward = proxy_dataset.r2r(score) + # score = docker.eval(mol, norm=False) + # mol.reward = proxy_dataset.r2r(score) # mol.smiles = s smis.append(mol.smiles) - rews.append(mol.reward) - print(mol.smiles, mol.reward) + # rews.append(mol.reward) + # print(mol.smiles, mol.reward) sampled_mols.append(mol) + + t0 = time.time() + rews = list(dock_pool.map(lambda a, m: a.eval.remote(m), sampled_mols)) + t1 = time.time() + print('Docking sim done in {}'.format(t1-t0)) + for i in range(len(sampled_mols)): + sampled_mols[i].reward = rews[i] print("Computing distances") dists =[] @@ -504,7 +531,9 @@ def main(args): reward_norm = args.reward_norm rews = [] smis = [] - docker = Docker(tmp_dir, cpu_req=args.cpu_req) + actors = [_SimDockLet.remote(tmp_dir) + for i in range(10)] + pool = ray.util.ActorPool(actors) args.repr_type = proxy_repr_type args.replay_mode = "dataset" args.reward_exp = 1 @@ -548,7 +577,7 @@ def main(args): print(f"Sampling mols: {i}") # sample molecule batch for generator and update dataset with docking scores for sampled batch - _proxy_dataset, r, s, batch_metrics = sample_and_update_dataset(args, model, proxy_dataset, gen_model_dataset, docker) + _proxy_dataset, r, s, batch_metrics = sample_and_update_dataset(args, model, proxy_dataset, gen_model_dataset, pool) print(f"Batch Metrics: dists_mean: {batch_metrics['dists_mean']}, dists_sum: {batch_metrics['dists_sum']}, reward_mean: {batch_metrics['reward_mean']}, reward_max: {batch_metrics['reward_max']}") rews.append(r) smis.append(s) diff --git a/mols/utils/chem.py b/mols/utils/chem.py index 99b252f..c630105 100644 --- a/mols/utils/chem.py +++ b/mols/utils/chem.py @@ -33,6 +33,14 @@ "Zr": 40, "Nb": 41, "Mo": 42, "Tc": 43, "Ru": 44, "Rh": 45, "Pd": 46, "Ag": 47, "Cd": 48, "In": 49, "Sn": 50, "Sb": 51, "Te": 52, "I": 53, "Xe": 54, "Cs": 55, "Ba": 56} + +def onehot(arr, num_classes, dtype=np.int): + arr = np.asarray(arr, dtype=np.int) + assert len(arr.shape) ==1, "dims other than 1 not implemented" + onehot_arr = np.zeros(arr.shape + (num_classes,), dtype=dtype) + onehot_arr[np.arange(arr.shape[0]), arr] = 1 + return onehot_arr + def mol_from_frag(jun_bonds, frags=None, frag_smis=None, coord=None, optimize=False): "joins 2 or more fragments into a single molecule" jun_bonds = np.asarray(jun_bonds) @@ -209,3 +217,104 @@ def mol_to_graph_backend(atmfeat, coord, bond, bondfeat, props={}, data_cls=Data else: data = data_cls(x=atmfeat, edge_index=edge_index, edge_attr=edge_attr, **props) return data + +class GenMolFile: + def __init__(self, outpath, mgltools, mglbin): + self.outpath = outpath + self.prepare_ligand4 = os.path.join(mgltools, "AutoDockTools/Utilities24/prepare_ligand4.py") + self.mglbin = mglbin + + os.makedirs(os.path.join(outpath, "sdf"), exist_ok=True) + os.makedirs(os.path.join(outpath, "mol2"), exist_ok=True) + os.makedirs(os.path.join(outpath, "pdbqt"), exist_ok=True) + + def __call__(self, smi, mol_name, num_conf): + sdf_file = os.path.join(self.outpath, "sdf", f"{mol_name}.sdf") + mol2_file = os.path.join(self.outpath, "mol2", f"{mol_name}.mol2") + pdbqt_file = os.path.join(self.outpath, "pdbqt", f"{mol_name}.pdbqt") + + mol = Chem.MolFromSmiles(smi) + Chem.SanitizeMol(mol) + mol_h = Chem.AddHs(mol) + AllChem.EmbedMultipleConfs(mol_h, numConfs=num_conf) + for i in range(num_conf): + AllChem.MMFFOptimizeMolecule(mol_h, confId=i) + mp = AllChem.MMFFGetMoleculeProperties(mol_h, mmffVariant='MMFF94') + # choose minimum energy conformer + mi = np.argmin([AllChem.MMFFGetMoleculeForceField(mol_h, mp, confId=i).CalcEnergy() for i in range(num_conf)]) + print(Chem.MolToMolBlock(mol_h, confId=int(mi)), file=open(sdf_file, 'w+')) + os.system(f"obabel -isdf {sdf_file} -omol2 -O {mol2_file}") + os.system(f"{self.mglbin}/pythonsh {self.prepare_ligand4} -l {mol2_file} -o {pdbqt_file}") + return pdbqt_file + + +class DockVina_smi: + def __init__(self, + outpath, + mgltools_dir=os.path.join("Programs", "mgltools_x86_64Linux2_1.5.6"), + vina_dir=os.path.join("Programs", "vina"), + docksetup_dir=os.path.join("data", "seh/4jnc"), + rec_file="4jnc.nohet.aligned.pdbqt", + bindsite=(-13.4, 26.3, -13.3, 20.013, 16.3, 18.5), + dock_pars="", + cleanup=True): + + self.outpath = outpath + self.mgltools = os.path.join(mgltools_dir, "MGLToolsPckgs") + self.mgltools_bin = os.path.join(mgltools_dir, "bin") + self.vina_bin = os.path.join(vina_dir, "bin/vina") + self.rec_file = os.path.join(docksetup_dir, rec_file) + self.bindsite = bindsite + self.dock_pars = dock_pars + self.cleanup = cleanup + + self.gen_molfile = GenMolFile(self.outpath, self.mgltools, self.mgltools_bin) + # make vina command + self.dock_cmd = "{} --receptor {} " \ + "--center_x {} --center_y {} --center_z {} " \ + "--size_x {} --size_y {} --size_z {} " + self.dock_cmd = self.dock_cmd.format(self.vina_bin, self.rec_file, *self.bindsite) + self.dock_cmd += " --ligand {} --out {}" + + os.makedirs(os.path.join(self.outpath, "docked"), exist_ok=True) + + def dock(self, smi, mol_name=None, molgen_conf=20): + mol_name = mol_name or ''.join(random.choices(string.ascii_uppercase + string.digits, k=15)) + docked_file = os.path.join(self.outpath, "docked", f"{mol_name}.pdb") + input_file = self.gen_molfile(smi, mol_name, molgen_conf) + # complete docking query + dock_cmd = self.dock_cmd.format(input_file, docked_file) + dock_cmd = dock_cmd + " " + self.dock_pars + + # dock + cl = subprocess.Popen(dock_cmd, shell=True, stdout=subprocess.PIPE) + cl.wait() + # parse energy + with open(docked_file) as f: + docked_pdb = f.readlines() + if docked_pdb[1].startswith("REMARK VINA RESULT"): + dockscore = float(docked_pdb[1].split()[3]) + else: + raise Exception("Can't parse docking energy") + # parse coordinates + # TODO: fix order + coord = [] + endmodel_idx = 0 + for idx, line in enumerate(docked_pdb): + if line.startswith("ENDMDL"): + endmodel_idx = idx + break + + docked_pdb_model_1 = docked_pdb[:endmodel_idx] # take only the model corresponding to the best energy + docked_pdb_model_1_atoms = [line for line in docked_pdb_model_1 if line.startswith("ATOM") + and line.split()[2][0] != 'H'] # ignore hydrogen + coord.append([line.split()[-7:-4] for line in docked_pdb_model_1_atoms]) + coord = np.array(coord, dtype=np.float32) + + if self.cleanup: + with contextlib.suppress(FileNotFoundError): + os.remove(os.path.join(self.outpath, "sdf", f"{mol_name}.sdf")) + os.remove(os.path.join(self.outpath, "mol2", f"{mol_name}.mol2")) + os.remove(os.path.join(self.outpath, "pdbqt", f"{mol_name}.pdbqt")) + os.remove(os.path.join(self.outpath, "docked", f"{mol_name}.pdb")) + return mol_name, dockscore, coord diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e3b8395 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +mkl-fft==1.3.0 +mkl-random==1.2.2 +mkl-service==2.4.0 +numpy +pandas==1.3.3 +ray==1.1.0 +rdkit-pypi +torch==1.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html +torch-geometric==1.6.3 +torch-scatter==2.0.5 -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html +torch-sparse==0.6.8 -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html +tqdm==4.36.1 \ No newline at end of file