Symbolic Regression/Classification C/C++ (rapidsai#3638)

This PR contains the implementation of the core algorithms of gplearn(tournaments + mutations + program evaluations) in cuml. Tagging all involved: @teju85 @venkywonka @vinaydes The goal is to complete the following tasks: - [x] Implement program execution and metric evaluation for a given dataset on the GPU - [x] Implement a batched version of the above for all programs in a generation - [x] Run tournaments for program selection on the GPU - [x] Perform all mutations on the CPU - [x] Fit, Predict and Transform functions for api - [x] Tests for all individual functions - [x] Add an example demonstrating how to perform symbolic regression (a similar approach can be taken for transformation too) Authors: - Vimarsh Sathia (https://github.com/vimarsh6739) - Venkat (https://github.com/venkywonka) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Venkat (https://github.com/venkywonka) - Thejaswi. N. S (https://github.com/teju85) - Corey J. Nolet (https://github.com/cjnolet) - Tamas Bela Feher (https://github.com/tfeher) - Dante Gama Dessavre (https://github.com/dantegd) URL: rapidsai#3638
vimarsh6739 · Nov 15, 2021 · eab7dee · eab7dee
1 parent 48d1107
commit eab7dee
Show file tree

Hide file tree

Showing 22 changed files with 3,714 additions and 163 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -268,6 +268,7 @@ if(BUILD_CUML_CPP_LIBRARY)
         src/fil/infer.cu
         src/glm/glm.cu
         src/genetic/genetic.cu
+        src/genetic/program.cu
         src/genetic/node.cu
         src/hdbscan/hdbscan.cu
         src/hdbscan/condensed_hierarchy.cu

diff --git a/cpp/examples/CMakeLists.txt b/cpp/examples/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,3 +16,4 @@
 
 add_subdirectory(kmeans)
 add_subdirectory(dbscan)
+add_subdirectory(symreg)
diff --git a/cpp/examples/symreg/CMakeLists.txt b/cpp/examples/symreg/CMakeLists.txt
@@ -0,0 +1,19 @@
+#=============================================================================
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+add_executable(symreg_example symreg_example.cpp)  # example binary built from a single source file
+target_include_directories(symreg_example PRIVATE ${CUML_INCLUDE_DIRECTORIES})  # NOTE(review): variable presumably set by the parent cuML build - confirm
+target_link_libraries(symreg_example cuml++)  # link against the cuml++ target
diff --git a/cpp/examples/symreg/CMakeLists_standalone.txt b/cpp/examples/symreg/CMakeLists_standalone.txt
@@ -0,0 +1,33 @@
+#
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+cmake_minimum_required(VERSION 3.17 FATAL_ERROR)  # FindCUDAToolkit (used below) first shipped in CMake 3.17
+include(ExternalProject)
+
+project(symreg_example VERSION 0.1.0 LANGUAGES CXX CUDA )
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# The imported targets linked at the bottom come from these packages; REQUIRED stops at configure time if one is absent
+find_package(CUDAToolkit REQUIRED)  # defines CUDA::cudart
+find_package(cuml REQUIRED)         # expected to define cuml::cuml++
+
+add_executable(symreg_example symreg_example.cpp)
+
+# Need to set linker language to CUDA to link the CUDA Runtime
+set_target_properties(symreg_example PROPERTIES LINKER_LANGUAGE "CUDA")
+
+# Link cuml and cudart
+target_link_libraries(symreg_example cuml::cuml++ CUDA::cudart)
diff --git a/cpp/examples/symreg/README.md b/cpp/examples/symreg/README.md
@@ -0,0 +1,87 @@
+# symbolic regression
+This subfolder contains an example of how to perform symbolic regression in cuML (from C++).
+There are two `CMakeLists.txt` in this folder:
+1. `CMakeLists.txt` (default) which is included when building cuML
+2. `CMakeLists_standalone.txt` as an example for a standalone project linking to `libcuml.so`
+
+## Build
+`symreg_example` is built as a part of cuML. To build it as a standalone executable, do
+```bash
+$ cmake .. -DCUML_LIBRARY_DIR=/path/to/directory/with/libcuml.so -DCUML_INCLUDE_DIR=/path/to/cuml/headers
+```
+Then build with `make` or `ninja`
+```
+$ make  
+Scanning dependencies of target raft
+[ 10%] Creating directories for 'raft'
+[ 20%] Performing download step (git clone) for 'raft'
+Cloning into 'raft'...
+[ 30%] Performing update step for 'raft'
+[ 40%] No patch step for 'raft'
+[ 50%] No configure step for 'raft'
+[ 60%] No build step for 'raft'
+[ 70%] No install step for 'raft'
+[ 80%] Completed 'raft'
+[ 80%] Built target raft
+Scanning dependencies of target symreg_example
+[ 90%] Building CXX object CMakeFiles/symreg_example.dir/symreg_example.cpp.o
+[100%] Linking CUDA executable symreg_example
+[100%] Built target symreg_example
+```
+`CMakeLists_standalone.txt` also loads a minimal set of header dependencies (namely [raft](https://github.com/rapidsai/raft) and [cub](https://github.com/NVIDIA/cub)) if they are not detected in the system.
+## Run
+
+1. Generate a toy training and test dataset
+```
+$ python prepare_input.py
+Training set has n_rows=250 n_cols=2
+Test set has n_rows=50 n_cols=2
+Wrote 500 values to train_data.txt
+Wrote 100 values to test_data.txt
+Wrote 250 values to train_labels.txt
+Wrote 50 values to test_labels.txt
+```
+
+2. Run the symbolic regressor using the 4 files as inputs. An example query is given below
+```bash
+$ ./symreg_example -n_cols 2                   \
+                   -n_train_rows 250           \
+                   -n_test_rows 50             \
+                   -random_state 21            \
+                   -population_size 4000       \
+                   -generations 20             \
+                   -stopping_criteria 0.01     \
+                   -p_crossover 0.7            \
+                   -p_subtree 0.1              \
+                   -p_hoist 0.05               \
+                   -p_point 0.1                \
+                   -parsimony_coefficient 0.01
+```
+
+3. The corresponding output for the above query is given below:
+
+```
+Reading input with 250 rows and 2 columns from train_data.txt.
+Reading input with 250 rows from train_labels.txt.
+Reading input with 50 rows and 2 columns from test_data.txt.
+Reading input with 50 rows from test_labels.txt.
+***************************************
+Allocating device memory...
+Allocation time =   0.259072ms
+***************************************
+Beginning training on given dataset...
+Finished training for 4 generations.
+              Best AST index :      1855
+              Best AST depth :         3
+             Best AST length :        13
+           Best AST equation :( add( sub( mult( X0, X0) , div( X1, X1) ) , sub( X1, mult( X1, X1) ) ) )
+Training time =    626.658ms
+***************************************
+Beginning Inference on Test dataset... 
+Inference score on test set = 5.29271e-08
+Inference time =    0.35248ms
+Some Predicted test values:
+-1.65061;-1.64081;-0.91711;-2.28976;-0.280688;
+Corresponding Actual test values:
+-1.65061;-1.64081;-0.91711;-2.28976;-0.280688;
+```
diff --git a/cpp/examples/symreg/prepare_input.py b/cpp/examples/symreg/prepare_input.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+rng = np.random.RandomState(seed=2021)  # fixed seed, so every run draws the same samples
+
+# Training samples: 250 rows x 2 columns drawn uniformly between -1 and 1
+X_train = rng.uniform(-1, 1, 500).reshape(250, 2)
+y_train = X_train[:, 0]**2 - X_train[:, 1]**2 + X_train[:, 1] - 1  # target: X0^2 - X1^2 + X1 - 1
+
+# Testing samples: 50 rows x 2 columns, same distribution and same target function
+X_test = rng.uniform(-1, 1, 100).reshape(50, 2)
+y_test = X_test[:, 0]**2 - X_test[:, 1]**2 + X_test[:, 1] - 1
+
+print("Training set has n_rows=%d n_cols=%d" %(X_train.shape))  # shape is a 2-tuple, filling both %d fields
+print("Test set has n_rows=%d n_cols=%d" %(X_test.shape))
+
+train_data    = "train_data.txt"
+test_data     = "test_data.txt"
+train_labels  = "train_labels.txt"
+test_labels   = "test_labels.txt"
+
+# Save the feature matrices transposed, so each feature column is written contiguously (col-major order)
+np.savetxt(train_data, X_train.T,fmt='%.7f')
+np.savetxt(test_data, X_test.T,fmt='%.7f')
+np.savetxt(train_labels, y_train,fmt='%.7f')  # labels are 1-D: one value per line
+np.savetxt(test_labels, y_test,fmt='%.7f')
+
+print("Wrote %d values to %s"%(X_train.size,train_data))  # .size is the total element count (rows * cols)
+print("Wrote %d values to %s"%(X_test.size,test_data))
+print("Wrote %d values to %s"%(y_train.size,train_labels))
+print("Wrote %d values to %s"%(y_test.size,test_labels))