Added Tree Regression Demos

Vivian7755 · Mar 5, 2019 · eac17fa · eac17fa
1 parent d985e8d
commit eac17fa
Show file tree

Hide file tree

Showing 13 changed files with 513 additions and 15 deletions.
diff --git a/examples/salsa/__init__.py b/examples/salsa/__init__.py
@@ -2,12 +2,5 @@
   Demo on the Shrunk Additive Least Squares Approximations method (SALSA) for high
   dimensional regression.
   -- [email protected]
-
-  If you use this experiment, please cite the following paper.
-    - Kandasamy K, Yu Y, "Additive Approximations in High Dimensional Nonparametric
-      Regression via the SALSA", International Conference on Machine Learning, 2016.
-    - (Dataset): Candanedo L M, Feldheim V, and Deramaix D, "Data Driven
-      Prediction Models of Energy Use of Appliances in a Low-energy House", Energy and
-      Buildings, 2017
 """
 
diff --git a/examples/salsa/salsa_energy.py b/examples/salsa/salsa_energy.py
@@ -22,5 +22,5 @@
 
 def objective(x):
   """ Objective. """
-  return salsa_compute_negative_validation_error(x, MAX_TR_DATA_SIZE)
+  return salsa_compute_negative_validation_error([MAX_TR_DATA_SIZE], x)
 
diff --git a/examples/tree_reg/README.md b/examples/tree_reg/README.md
@@ -4,13 +4,18 @@ gradient boosted regression and random forest classification.
 
 
 To run this demo, you will need to download the `news_popularity.p` and
-`naval_propulsion.p` datasets
-into this directory. The datasets are available
+`naval_propulsion.p` datasets into this directory. The datasets are available
 [here](http://www.cs.cmu.edu/~kkandasa/dragonfly_datasets.html).
-Then, run the following commands from this directory.
+You will also need to install
+[scikit-learn](https://scikit-learn.org/stable/).
+
+
+Look at [`in_code_demo.py`](in_code_demo.py) for a demo on how to use this in your code.
+Alternatively, run the following commands from this directory for gradient boosted regression
+on the naval propulsion dataset.
 ```bash
-$ dragonfly-script.py --config config_salsa_energy.json --options ../options_files/options_example_realtime.txt
-$ dragonfly-script.py --config config_salsa_energy_mf.json --options ../options_files/options_example_realtime.txt # For multi-fidelity version
+$ dragonfly-script.py --config config_naval_gbr.json --options ../options_files/options_example_realtime.txt
+$ dragonfly-script.py --config config_naval_gbr_mf.json --options ../options_files/options_example_realtime.txt # For multi-fidelity version
 ```
 
 &nbsp;

diff --git a/examples/tree_reg/__init__.py b/examples/tree_reg/__init__.py
@@ -1,6 +1,6 @@
 """
-  A demo for fitting hyper-parameters in Tree based ensemble regression methods such as
-  gradient boosted regression and random forest classification.
+  A demo for fitting hyper-parameters in Tree based ensemble methods such as gradient
+  boosted regression/classification or random forest regression/classification.
   -- [email protected]
 """
 
diff --git a/examples/tree_reg/config_naval_gbr.json b/examples/tree_reg/config_naval_gbr.json
@@ -0,0 +1,63 @@
+{
+"name": "naval_gbr",
+
+"domain" : {
+
+  "loss" : {
+    "name":"loss",
+    "type":"discrete",
+    "items":"ls-lad-huber"
+  },
+
+  "log10_learning_rate" : {
+    "name":"log10_learning_rate",
+    "type":"float",
+    "min":-3,
+    "max":1
+  },
+
+  "n_estimators" : {
+    "name":"n_estimators",
+    "type":"int",
+    "min":1,
+    "max":1000
+  },
+
+  "subsample" : {
+    "name":"subsample",
+    "type":"float",
+    "min":0.1,
+    "max":1.0
+  },
+
+  "criterion" : {
+    "name":"criterion",
+    "type":"discrete",
+    "items":"friedman_mse-mse-mae"
+  },
+
+  "min_samples_split_frac" : {
+    "name":"min_samples_split_frac",
+    "type":"float",
+    "min":0.001,
+    "max":0.5
+  },
+
+  "min_samples_leaf_frac" : {
+    "name":"min_samples_leaf_frac",
+    "type":"float",
+    "min":0.001,
+    "max":0.5
+  },
+
+  "max_depth" : {
+    "name":"max_depth",
+    "type":"int",
+    "min":3,
+    "max":20
+  }
+
+ }
+
+}
+
diff --git a/examples/tree_reg/config_naval_gbr_mf.json b/examples/tree_reg/config_naval_gbr_mf.json
@@ -0,0 +1,74 @@
+{
+"name": "naval_gbr_mf",
+
+"domain" : {
+
+  "loss" : {
+    "name":"loss",
+    "type":"discrete",
+    "items":"ls-lad-huber"
+  },
+
+  "log10_learning_rate" : {
+    "name":"log10_learning_rate",
+    "type":"float",
+    "min":-3,
+    "max":1
+  },
+
+  "n_estimators" : {
+    "name":"n_estimators",
+    "type":"int",
+    "min":1,
+    "max":1000
+  },
+
+  "subsample" : {
+    "name":"subsample",
+    "type":"float",
+    "min":0.1,
+    "max":1.0
+  },
+
+  "criterion" : {
+    "name":"criterion",
+    "type":"discrete",
+    "items":"friedman_mse-mse-mae"
+  },
+
+  "min_samples_split_frac" : {
+    "name":"min_samples_split_frac",
+    "type":"float",
+    "min":0.001,
+    "max":0.5
+  },
+
+  "min_samples_leaf_frac" : {
+    "name":"min_samples_leaf_frac",
+    "type":"float",
+    "min":0.001,
+    "max":0.5
+  },
+
+  "max_depth" : {
+    "name":"max_depth",
+    "type":"int",
+    "min":3,
+    "max":20
+  }
+
+ },
+
+"fidel_space": {
+  "log_num_tr_data_to_use": {
+    "name":"log_num_tr_data_to_use",
+    "type":"float",
+    "min":8.0063675676502459,
+    "max":9.1049798563183568
+  }
+ },
+
+"fidel_to_opt":[9.1049798563183568]
+
+}
+
diff --git a/examples/tree_reg/config_news_rfr.json b/examples/tree_reg/config_news_rfr.json
@@ -0,0 +1,50 @@
+{
+"name": "news_rfr",
+
+"domain" : {
+
+  "n_estimators" : {
+    "name":"n_estimators",
+    "type":"int",
+    "min":1,
+    "max":1000
+  },
+
+  "criterion" : {
+    "name":"criterion",
+    "type":"discrete",
+    "items":"mse-mae"
+  },
+
+  "max_depth" : {
+    "name":"max_depth",
+    "type":"int",
+    "min":3,
+    "max":20
+  },
+
+  "min_samples_split_frac" : {
+    "name":"min_samples_split_frac",
+    "type":"float",
+    "min":0.001,
+    "max":0.5
+  },
+
+  "min_samples_leaf_frac" : {
+    "name":"min_samples_leaf_frac",
+    "type":"float",
+    "min":0.001,
+    "max":0.5
+  },
+
+  "max_features_frac" : {
+    "name":"max_features_frac",
+    "type":"float",
+    "min":0.001,
+    "max":1.0
+  }
+
+ }
+
+}
+
diff --git a/examples/tree_reg/config_news_rfr_mf.json b/examples/tree_reg/config_news_rfr_mf.json
@@ -0,0 +1,61 @@
+{
+"name": "news_rfr_mf",
+
+"domain" : {
+
+  "n_estimators" : {
+    "name":"n_estimators",
+    "type":"int",
+    "min":1,
+    "max":1000
+  },
+
+  "criterion" : {
+    "name":"criterion",
+    "type":"discrete",
+    "items":"mse-mae"
+  },
+
+  "max_depth" : {
+    "name":"max_depth",
+    "type":"int",
+    "min":3,
+    "max":20
+  },
+
+  "min_samples_split_frac" : {
+    "name":"min_samples_split_frac",
+    "type":"float",
+    "min":0.001,
+    "max":0.5
+  },
+
+  "min_samples_leaf_frac" : {
+    "name":"min_samples_leaf_frac",
+    "type":"float",
+    "min":0.001,
+    "max":0.5
+  },
+
+  "max_features_frac" : {
+    "name":"max_features_frac",
+    "type":"float",
+    "min":0.001,
+    "max":1.0
+  }
+
+ },
+
+"fidel_space": {
+  "log_num_tr_data_to_use": {
+    "name":"log_num_tr_data_to_use",
+    "type":"float",
+    "min":8.5171931914162382,
+    "max":9.9034875525361272
+  }
+ },
+
+"fidel_to_opt":[9.9034875525361272]
+
+}
+
diff --git a/examples/tree_reg/naval_gbr.py b/examples/tree_reg/naval_gbr.py
@@ -0,0 +1,15 @@
+"""
+  Tuning the hyperparameters of Gradient boosted regression on the Naval propulsion
+  dataset.
+  -- [email protected]
+"""
+
+# pylint: disable=invalid-name
+
+from naval_gbr_mf import MAX_TR_DATA_SIZE
+from naval_gbr_mf import objective as objective_mf
+
+def objective(x):
+  """ Objective. """
+  return objective_mf([MAX_TR_DATA_SIZE], x)
+
diff --git a/examples/tree_reg/naval_gbr_mf.py b/examples/tree_reg/naval_gbr_mf.py
@@ -0,0 +1,42 @@
+"""
+  Tuning the hyperparameters of Gradient boosted regression on the Naval propulsion
+  dataset.
+  -- [email protected]
+"""
+
+# pylint: disable=invalid-name
+# pylint: disable=unexpected-keyword-arg
+
+import pickle
+# Local
+from skltree import gbr_train_and_validate, get_tr_dataset_size_from_z0
+
+try:
+  import os
+  import sys
+  file_name = 'naval_propulsion.p'
+  curr_dir_path = os.path.dirname(os.path.realpath(__file__))
+  data_path = os.path.join(curr_dir_path, file_name)
+  if sys.version_info[0] < 3:
+    DATA = pickle.load(open(data_path, 'rb'))
+  else:
+    DATA = pickle.load(open(data_path, 'rb'), encoding='latin1')
+except IOError:
+  print(('Could not load file %s. Make sure the file %s is in the same directory as ' +
+         'this file or pass the dataset to the function.')%(file_name, data_path))
+
+MAX_TR_DATA_SIZE = 9000
+MAX_VA_DATA_SIZE = 2000
+
+
+def objective(z, x):
+  """ Objective. """
+  num_tr_data_to_use = get_tr_dataset_size_from_z0(z[0])
+  return gbr_train_and_validate(x, DATA, num_tr_data_to_use,
+                                MAX_TR_DATA_SIZE, MAX_VA_DATA_SIZE)
+
+def cost(z):
+  """ Compute cost. """
+  num_tr_data_to_use = get_tr_dataset_size_from_z0(z[0])
+  return num_tr_data_to_use / float(MAX_TR_DATA_SIZE)
+
diff --git a/examples/tree_reg/news_rfr.py b/examples/tree_reg/news_rfr.py
@@ -0,0 +1,14 @@
+"""
+  Tuning the hyperparameters of Random forest regression on the News Popularity dataset
+  -- [email protected]
+"""
+
+# pylint: disable=invalid-name
+
+from news_rfr_mf import MAX_TR_DATA_SIZE
+from news_rfr_mf import objective as objective_mf
+
+def objective(x):
+  """ Objective. """
+  return objective_mf([MAX_TR_DATA_SIZE], x)
+