Pre-trained Models on MoleculeNet (#104)

* Fix * Fix * Fix * Fix * Fix * Fix * Fix * Fix * Fix * Fix * Update * Fix * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Fix * Update * Update * Update * Update * Update * Update * Fix * Fix * Fix * Fix * Fix * Update * Fix * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Fix * Update * Update * Update * Fix * Update * Update * Update * Update * Update * Update * Fix * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Fix * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Fix * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Fix * Update * Update * Update * Fix * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Updateg * Update
awslabs · Oct 28, 2020 · e4a8408 · e4a8408
1 parent ecd95c9
commit e4a8408
Showing 195 changed files with 5,924 additions and 480 deletions.
diff --git a/examples/property_prediction/csv_data_configuration/README.md b/examples/property_prediction/csv_data_configuration/README.md
@@ -3,6 +3,8 @@
 The scripts in this directory are helpful for quickly prototyping a GNN-based model for molecular property 
 prediction on a new CSV dataset.
 
+The command line interface has been tested against the MoleculeNet benchmark. For more details, see [here](../moleculenet).
+
 ## Data Preparation
 
 For training, we assume that the molecular properties are recorded in a CSV file, where one column holds the SMILES strings 
@@ -86,10 +88,12 @@ Other optional arguments include:
     we assume all columns are molecular properties except for the SMILES column.
 - **Take the logarithm of the labels** `-lv` [default=False]
     - Whether to take the logarithm of the labels for modeling
-- **Split**: `-s split` [default=scaffold]
+- **Split**: `-s split` [default=scaffold_smiles]
     - Specifies the split for the dataset
-    - By default we use `'scaffold'` for dataset splitting based on Bemis-Murcko scaffolds, alternatively we can 
-      use `'random'` for random split.
+    - By default we use `'scaffold_smiles'` for scaffold split based on 
+      `rdkit.Chem.Scaffolds.MurckoScaffold.MurckoScaffoldSmiles`. Alternatively, we can 
+      use `'random'` for random split or `'scaffold_decompose'` for scaffold split based on 
+      `rdkit.Chem.AllChem.MurckoDecompose`.
 - **Split Ratio**: `-sr a,b,c` [default=0.8,0.1,0.1]
     - Specifies the proportion of the dataset to be used for training, validation and test. 
 - **Evaluation Metric**: `-me metric` [default=r2]
@@ -179,10 +183,12 @@ Other optional arguments include:
 - **Task**: `-t task1,task2,task3,...` 
     - Specifies the headers for task columns in the CSV file. If not specified, 
     we assume all columns are molecular properties except for the SMILES column.
-- **Split**: `-s split` [default=scaffold]
+- **Split**: `-s split` [default=scaffold_smiles]
     - Specifies the split for the dataset
-    - By default we use `'scaffold'` for dataset splitting based on Bemis-Murcko scaffolds, alternatively we can 
-      use `'random'` for random split.
+    - By default we use `'scaffold_smiles'` for scaffold split based on 
+      `rdkit.Chem.Scaffolds.MurckoScaffold.MurckoScaffoldSmiles`. Alternatively, we can 
+      use `'random'` for random split or `'scaffold_decompose'` for scaffold split based on 
+      `rdkit.Chem.AllChem.MurckoDecompose`.
 - **Split Ratio**: `-sr a,b,c` [default=0.8,0.1,0.1]
     - Specifies the proportion of the dataset to be used for training, validation and test. 
 - **Evaluation Metric**: `-me metric` [default=roc_auc_score]

diff --git a/examples/property_prediction/csv_data_configuration/classification_train.py b/examples/property_prediction/csv_data_configuration/classification_train.py
@@ -82,9 +82,9 @@ def main(args, exp_config, train_set, val_set, test_set):
     loss_criterion = nn.BCEWithLogitsLoss(reduction='none')
     optimizer = Adam(model.parameters(), lr=exp_config['lr'],
                      weight_decay=exp_config['weight_decay'])
-    stopper = EarlyStopping(mode=args['early_stop_mode'],
-                            patience=exp_config['patience'],
-                            filename=args['trial_path'] + '/model.pth')
+    stopper = EarlyStopping(patience=exp_config['patience'],
+                            filename=args['trial_path'] + '/model.pth',
+                            metric=args['metric'])
 
     for epoch in range(args['num_epochs']):
         # Train
@@ -151,10 +151,16 @@ def objective(hyperparams):
                         help='Header for the tasks to model. If None, we will model '
                              'all the columns except for the smiles_column in the CSV file. '
                              '(default: None)')
-    parser.add_argument('-s', '--split', choices=['scaffold', 'random'], default='scaffold',
-                        help='Dataset splitting method (default: scaffold)')
+    parser.add_argument('-s', '--split',
+                        choices=['scaffold_decompose', 'scaffold_smiles', 'random'],
+                        default='scaffold_smiles',
+                        help='Dataset splitting method (default: scaffold_smiles). For scaffold '
+                             'split based on rdkit.Chem.AllChem.MurckoDecompose, '
+                             'use scaffold_decompose. For scaffold split based on '
+                             'rdkit.Chem.Scaffolds.MurckoScaffold.MurckoScaffoldSmiles, '
+                             'use scaffold_smiles.')
     parser.add_argument('-sr', '--split-ratio', default='0.8,0.1,0.1', type=str,
-                        help='Proportion of the dataset used for training, validation and test, '
+                        help='Proportion of the dataset to use for training, validation and test, '
                              '(default: 0.8,0.1,0.1)')
     parser.add_argument('-me', '--metric', choices=['roc_auc_score', 'pr_auc_score'],
                         default='roc_auc_score',
@@ -193,9 +199,6 @@ def objective(hyperparams):
     if args['task_names'] is not None:
         args['task_names'] = args['task_names'].split(',')
 
-    if args['metric'] in ['roc_auc_score', 'pr_auc_score']:
-        args['early_stop_mode'] = 'higher'
-
     args = init_featurizer(args)
     df = pd.read_csv(args['csv_path'])
     mkdir_p(args['result_path'])

diff --git a/examples/property_prediction/csv_data_configuration/regression_train.py b/examples/property_prediction/csv_data_configuration/regression_train.py
@@ -39,9 +39,9 @@ def run_a_train_epoch(args, epoch, model, data_loader, loss_criterion, optimizer
         if batch_id % args['print_every'] == 0:
             print('epoch {:d}/{:d}, batch {:d}/{:d}, loss {:.4f}'.format(
                 epoch + 1, args['num_epochs'], batch_id + 1, len(data_loader), loss.item()))
-    total_score = np.mean(train_meter.compute_metric(args['metric']))
+    train_score = np.mean(train_meter.compute_metric(args['metric']))
     print('epoch {:d}/{:d}, training {} {:.4f}'.format(
-        epoch + 1, args['num_epochs'], args['metric'], total_score))
+        epoch + 1, args['num_epochs'], args['metric'], train_score))
 
 def run_an_eval_epoch(args, model, data_loader):
     model.eval()
@@ -52,8 +52,7 @@ def run_an_eval_epoch(args, model, data_loader):
             labels = labels.to(args['device'])
             prediction = predict(args, model, bg)
             eval_meter.update(prediction, labels, masks)
-        total_score = np.mean(eval_meter.compute_metric(args['metric']))
-    return total_score
+    return np.mean(eval_meter.compute_metric(args['metric']))
 
 def main(args, exp_config, train_set, val_set, test_set):
     # Record settings
@@ -82,9 +81,9 @@ def main(args, exp_config, train_set, val_set, test_set):
     loss_criterion = nn.SmoothL1Loss(reduction='none')
     optimizer = Adam(model.parameters(), lr=exp_config['lr'],
                      weight_decay=exp_config['weight_decay'])
-    stopper = EarlyStopping(mode=args['early_stop_mode'],
-                            patience=exp_config['patience'],
-                            filename=args['trial_path'] + '/model.pth')
+    stopper = EarlyStopping(patience=exp_config['patience'],
+                            filename=args['trial_path'] + '/model.pth',
+                            metric=args['metric'])
 
     for epoch in range(args['num_epochs']):
         # Train
@@ -153,10 +152,16 @@ def objective(hyperparams):
                         help='Header for the tasks to model. If None, we will model '
                              'all the columns except for the smiles_column in the CSV file. '
                              '(default: None)')
-    parser.add_argument('-s', '--split', choices=['scaffold', 'random'], default='scaffold',
-                        help='Dataset splitting method (default: scaffold)')
+    parser.add_argument('-s', '--split',
+                        choices=['scaffold_decompose', 'scaffold_smiles', 'random'],
+                        default='scaffold_smiles',
+                        help='Dataset splitting method (default: scaffold_smiles). For scaffold '
+                             'split based on rdkit.Chem.AllChem.MurckoDecompose, '
+                             'use scaffold_decompose. For scaffold split based on '
+                             'rdkit.Chem.Scaffolds.MurckoScaffold.MurckoScaffoldSmiles, '
+                             'use scaffold_smiles.')
     parser.add_argument('-sr', '--split-ratio', default='0.8,0.1,0.1', type=str,
-                        help='Proportion of the dataset used for training, validation and test '
+                        help='Proportion of the dataset to use for training, validation and test '
                              '(default: 0.8,0.1,0.1)')
     parser.add_argument('-me', '--metric', choices=['r2', 'mae', 'rmse'], default='r2',
                         help='Metric for evaluation (default: r2)')
@@ -194,11 +199,6 @@ def objective(hyperparams):
     if args['task_names'] is not None:
         args['task_names'] = args['task_names'].split(',')
 
-    if args['metric'] == 'r2':
-        args['early_stop_mode'] = 'higher'
-    else:
-        args['early_stop_mode'] = 'lower'
-
     args = init_featurizer(args)
     df = pd.read_csv(args['csv_path'])
     mkdir_p(args['result_path'])

diff --git a/examples/property_prediction/csv_data_configuration/utils.py b/examples/property_prediction/csv_data_configuration/utils.py
@@ -149,9 +149,14 @@ def split_dataset(args, dataset):
         Test subset
     """
     train_ratio, val_ratio, test_ratio = map(float, args['split_ratio'].split(','))
-    if args['split'] == 'scaffold':
+    if args['split'] == 'scaffold_decompose':
         train_set, val_set, test_set = ScaffoldSplitter.train_val_test_split(
-            dataset, frac_train=train_ratio, frac_val=val_ratio, frac_test=test_ratio)
+            dataset, frac_train=train_ratio, frac_val=val_ratio, frac_test=test_ratio,
+            scaffold_func='decompose')
+    elif args['split'] == 'scaffold_smiles':
+        train_set, val_set, test_set = ScaffoldSplitter.train_val_test_split(
+            dataset, frac_train=train_ratio, frac_val=val_ratio, frac_test=test_ratio,
+            scaffold_func='smiles')
     elif args['split'] == 'random':
         train_set, val_set, test_set = RandomSplitter.train_val_test_split(
             dataset, frac_train=train_ratio, frac_val=val_ratio, frac_test=test_ratio)