Skip to content

Commit

Permalink
[Dataset] Add Datasets BBBP, ToxCast, SIDER, ClinTox (#86)
Browse files Browse the repository at this point in the history
* Fix

* Update

* Fix

* Add MUV

* Add HIV

* Add BACE

* Update
  • Loading branch information
mufeili authored Aug 30, 2020
1 parent b014174 commit 32b14bd
Show file tree
Hide file tree
Showing 10 changed files with 629 additions and 18 deletions.
28 changes: 28 additions & 0 deletions docs/source/api/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,34 @@ BACE
:members: __getitem__, __len__, task_pos_weights
:show-inheritance:

BBBP
````

.. autoclass:: dgllife.data.BBBP
    :members: __getitem__, __len__, task_pos_weights
    :show-inheritance:

ToxCast
```````

.. autoclass:: dgllife.data.ToxCast
    :members: __getitem__, __len__, task_pos_weights
    :show-inheritance:

SIDER
`````

.. autoclass:: dgllife.data.SIDER
    :members: __getitem__, __len__, task_pos_weights
    :show-inheritance:

ClinTox
```````

.. autoclass:: dgllife.data.ClinTox
    :members: __getitem__, __len__, task_pos_weights
    :show-inheritance:

Experimental solubility determined at AstraZeneca, extracted from ChEMBL
````````````````````````````````````````````````````````````````````````

Expand Down
4 changes: 4 additions & 0 deletions python/dgllife/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,8 @@
from .muv import *
from .hiv import *
from .bace import *
from .bbbp import *
from .toxcast import *
from .sider import *
from .clintox import *
from .uspto import *
155 changes: 155 additions & 0 deletions python/dgllife/data/bbbp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# BBBP from MoleculeNet for the prediction of permeability properties.

import pandas as pd

from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive

from .csv_dataset import MoleculeCSVDataset
from ..utils.mol_to_graph import smiles_to_bigraph

__all__ = ['BBBP']

class BBBP(MoleculeCSVDataset):
    r"""BBBP from MoleculeNet for the prediction of permeability properties

    The Blood-brain barrier penetration (BBBP) dataset comes from a study on
    the modeling and prediction of the barrier permeability. The blood-brain
    barrier is a membrane separating circulating blood from brain extracellular
    fluid, and it blocks most drugs, hormones and neurotransmitters. Getting
    through the barrier is therefore a long-standing issue in the development
    of drugs targeting the central nervous system. The dataset provides binary
    permeability labels for over 2000 compounds.

    References:

        * [1] MoleculeNet: A Benchmark for Molecular Machine Learning.
        * [2] A Bayesian approach to in silico blood-brain barrier penetration modeling

    Parameters
    ----------
    smiles_to_graph: callable, str -> DGLGraph
        A function turning a SMILES string into a DGLGraph.
        Default to :func:`dgllife.utils.smiles_to_bigraph`.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    load : bool
        Whether to load the previously pre-processed dataset or pre-process from scratch.
        ``load`` should be False when we want to try different graph construction and
        featurization methods and need to preprocess from scratch. Default to False.
    log_every : int
        Print a message every time ``log_every`` molecules are processed. Default to 1000.
    cache_file_path : str
        Path to the cached DGLGraphs, default to './bbbp_dglgraph.bin'.
    n_jobs : int
        The maximum number of concurrently running jobs for graph construction and
        featurization, using joblib backend. Default to 1.

    Examples
    --------

    >>> import torch
    >>> from dgllife.data import BBBP
    >>> from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer

    >>> dataset = BBBP(smiles_to_bigraph, CanonicalAtomFeaturizer())
    >>> # Get size of the dataset
    >>> len(dataset)
    2050
    >>> # Get the 0th datapoint, consisting of SMILES, DGLGraph, labels, and masks
    >>> dataset[0]
    ('[Cl].CC(C)NCC(O)COc1cccc2ccccc12',
     Graph(num_nodes=20, num_edges=40,
           ndata_schemes={'h': Scheme(shape=(74,), dtype=torch.float32)}
           edata_schemes={}),
     tensor([1.]),
     tensor([1.]))

    The dataset instance also keeps the compound names, read from the ``name``
    column of the CSV file.

    >>> dataset.names[0]
    'Propanolol'

    Setting ``load_full`` makes indexing return the name together with the SMILES,
    DGLGraph, labels, and masks.

    >>> dataset.load_full = True
    >>> dataset[0]
    ('[Cl].CC(C)NCC(O)COc1cccc2ccccc12',
     Graph(num_nodes=20, num_edges=40,
           ndata_schemes={'h': Scheme(shape=(74,), dtype=torch.float32)}
           edata_schemes={}),
     tensor([1.]),
     tensor([1.]),
     'Propanolol')

    To address the imbalance between positive and negative samples, we can re-weight
    positive samples for each task based on the training datapoints.

    >>> train_ids = torch.arange(500)
    >>> dataset.task_pos_weights(train_ids)
    tensor([0.7123])
    """
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./bbbp_dglgraph.bin',
                 n_jobs=1):

        self._url = 'dataset/bbbp.zip'

        # The archive and the folder it is unpacked into both sit in the
        # DGL download directory.
        download_root = get_download_dir()
        archive_path = download_root + '/bbbp.zip'
        extract_dir = download_root + '/bbbp'
        download(_get_dgl_url(self._url), path=archive_path, overwrite=False)
        extract_archive(archive_path, extract_dir)
        frame = pd.read_csv(extract_dir + '/BBBP.csv')

        # Compound names are kept on the instance; ``load_full`` controls
        # whether ``__getitem__`` also returns them.
        self.names = frame['name'].tolist()
        self.load_full = False

        super(BBBP, self).__init__(df=frame,
                                   smiles_column='smiles',
                                   task_names=['p_np'],
                                   smiles_to_graph=smiles_to_graph,
                                   node_featurizer=node_featurizer,
                                   edge_featurizer=edge_featurizer,
                                   cache_file_path=cache_file_path,
                                   load=load,
                                   log_every=log_every,
                                   init_mask=True,
                                   n_jobs=n_jobs)

    def __getitem__(self, item):
        """Get datapoint with index

        Parameters
        ----------
        item : int
            Datapoint index

        Returns
        -------
        str
            SMILES for the ith datapoint
        DGLGraph
            DGLGraph for the ith datapoint
        Tensor of dtype float32 and shape (T)
            Labels of the ith datapoint for all tasks. T for the number of tasks.
        Tensor of dtype float32 and shape (T)
            Binary masks of the ith datapoint indicating the existence of labels for all tasks.
        str, optional
            Name for the ith compound, returned only when ``self.load_full`` is True.
        """
        datapoint = (self.smiles[item], self.graphs[item],
                     self.labels[item], self.mask[item])
        if not self.load_full:
            return datapoint
        # Full mode: the compound name goes last.
        return datapoint + (self.names[item],)
130 changes: 130 additions & 0 deletions python/dgllife/data/clintox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ClinTox from MoleculeNet for the prediction of clinical trial toxicity
# (or absence of toxicity) and FDA approval status

import pandas as pd

from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive

from .csv_dataset import MoleculeCSVDataset
from ..utils.mol_to_graph import smiles_to_bigraph

__all__ = ['ClinTox']

class ClinTox(MoleculeCSVDataset):
    r"""ClinTox from MoleculeNet for the prediction of clinical trial toxicity
    (or absence of toxicity) and FDA approval status

    The ClinTox dataset compares drugs approved by the FDA and drugs that have failed
    clinical trials for toxicity reasons. It covers two classification tasks for 1491
    drug compounds with known chemical structures: (1) clinical trial toxicity (or
    absence of toxicity) and (2) FDA approval status. The MoleculeNet benchmark takes
    the list of FDA-approved drugs from the SWEETLEAD database, and the list of drugs
    that failed clinical trials for toxicity reasons from the Aggregate Analysis of
    ClinicalTrials.gov (AACT) database.

    References:

        * [1] MoleculeNet: A Benchmark for Molecular Machine Learning.

    Parameters
    ----------
    smiles_to_graph: callable, str -> DGLGraph
        A function turning a SMILES string into a DGLGraph.
        Default to :func:`dgllife.utils.smiles_to_bigraph`.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    load : bool
        Whether to load the previously pre-processed dataset or pre-process from scratch.
        ``load`` should be False when we want to try different graph construction and
        featurization methods and need to preprocess from scratch. Default to False.
    log_every : int
        Print a message every time ``log_every`` molecules are processed. Default to 1000.
    cache_file_path : str
        Path to the cached DGLGraphs, default to './clintox_dglgraph.bin'.
    n_jobs : int
        The maximum number of concurrently running jobs for graph construction and
        featurization, using joblib backend. Default to 1.

    Examples
    --------

    >>> import torch
    >>> from dgllife.data import ClinTox
    >>> from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer

    >>> dataset = ClinTox(smiles_to_bigraph, CanonicalAtomFeaturizer())
    >>> # Get size of the dataset
    >>> len(dataset)
    1484
    >>> # Get the 0th datapoint, consisting of SMILES, DGLGraph, labels, and masks
    >>> dataset[0]
    ('*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC',
     Graph(num_nodes=24, num_edges=46,
           ndata_schemes={'h': Scheme(shape=(74,), dtype=torch.float32)}
           edata_schemes={}),
     tensor([1., 0.]),
     tensor([1., 1.]))

    To address the imbalance between positive and negative samples, we can re-weight
    positive samples for each task based on the training datapoints.

    >>> train_ids = torch.arange(500)
    >>> dataset.task_pos_weights(train_ids)
    tensor([ 0.0684, 10.9048])
    """
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./clintox_dglgraph.bin',
                 n_jobs=1):

        self._url = 'dataset/clintox.zip'

        # The archive and the folder it is unpacked into both sit in the
        # DGL download directory.
        download_root = get_download_dir()
        archive_path = download_root + '/clintox.zip'
        extract_dir = download_root + '/clintox'
        download(_get_dgl_url(self._url), path=archive_path, overwrite=False)
        extract_archive(archive_path, extract_dir)
        frame = pd.read_csv(extract_dir + '/clintox.csv')

        # No ``task_names`` is passed, so task selection is left to the
        # parent class.
        super(ClinTox, self).__init__(df=frame,
                                      smiles_column='smiles',
                                      smiles_to_graph=smiles_to_graph,
                                      node_featurizer=node_featurizer,
                                      edge_featurizer=edge_featurizer,
                                      cache_file_path=cache_file_path,
                                      load=load,
                                      log_every=log_every,
                                      init_mask=True,
                                      n_jobs=n_jobs)

    def __getitem__(self, item):
        """Get datapoint with index

        Parameters
        ----------
        item : int
            Datapoint index

        Returns
        -------
        str
            SMILES for the ith datapoint
        DGLGraph
            DGLGraph for the ith datapoint
        Tensor of dtype float32 and shape (T)
            Labels of the ith datapoint for all tasks. T for the number of tasks.
        Tensor of dtype float32 and shape (T)
            Binary masks of the ith datapoint indicating the existence of labels for all tasks.
        """
        smiles = self.smiles[item]
        graph = self.graphs[item]
        labels = self.labels[item]
        mask = self.mask[item]
        return smiles, graph, labels, mask
10 changes: 7 additions & 3 deletions python/dgllife/data/csv_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,17 @@ def _pre_process(self, smiles_to_graph, node_featurizer,
self.graphs = pmap(smiles_to_graph,
self.smiles,
node_featurizer=node_featurizer,
edge_featurizer=edge_featurizer)
edge_featurizer=edge_featurizer,
n_jobs=n_jobs)
self.graphs = [g for g in self.graphs if g is not None]
else:
for i, s in enumerate(self.smiles):
if (i + 1) % log_every == 0:
print('Processing molecule {:d}/{:d}'.format(i+1, len(self)))
self.graphs.append(smiles_to_graph(s, node_featurizer=node_featurizer,
edge_featurizer=edge_featurizer))
g = smiles_to_graph(s, node_featurizer=node_featurizer,
edge_featurizer=edge_featurizer)
if g is not None:
self.graphs.append(g)
_label_values = self.df[self.task_names].values
# np.nan_to_num will also turn inf into a very large number
self.labels = F.zerocopy_from_numpy(np.nan_to_num(_label_values).astype(np.float32))
Expand Down
Loading

0 comments on commit 32b14bd

Please sign in to comment.