Skip to content

Commit

Permalink
Symbolic Regression/Classification C/C++ (rapidsai#3638)
Browse files Browse the repository at this point in the history
This PR contains the implementation of the core algorithms of gplearn(tournaments + mutations + program evaluations) in cuml. 
Tagging all involved: @teju85 @venkywonka @vinaydes 

The goal is to complete the following tasks:

- [x] Implement program execution and metric evaluation for a given dataset on the GPU
- [x] Implement a batched version of the above for all programs in a generation
- [x] Run tournaments for program selection on the GPU
- [x] Perform all mutations on the CPU
- [x] Fit, Predict and Transform functions for api
- [x] Tests for all individual functions  
- [x] Add an example demonstrating how to perform symbolic regression (a similar approach can be taken for transformation too)

Authors:
  - Vimarsh Sathia (https://github.com/vimarsh6739)
  - Venkat (https://github.com/venkywonka)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Venkat (https://github.com/venkywonka)
  - Thejaswi. N. S (https://github.com/teju85)
  - Corey J. Nolet (https://github.com/cjnolet)
  - Tamas Bela Feher (https://github.com/tfeher)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: rapidsai#3638
  • Loading branch information
vimarsh6739 authored Nov 15, 2021
1 parent 48d1107 commit eab7dee
Show file tree
Hide file tree
Showing 22 changed files with 3,714 additions and 163 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ if(BUILD_CUML_CPP_LIBRARY)
src/fil/infer.cu
src/glm/glm.cu
src/genetic/genetic.cu
src/genetic/program.cu
src/genetic/node.cu
src/hdbscan/hdbscan.cu
src/hdbscan/condensed_hierarchy.cu
Expand Down
3 changes: 2 additions & 1 deletion cpp/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -16,3 +16,4 @@

# Register each C++ example with the build. Every name below is a
# subdirectory of this folder that provides its own CMakeLists.txt.
add_subdirectory(kmeans)
add_subdirectory(dbscan)
# Symbolic regression example (defines the symreg_example executable).
add_subdirectory(symreg)
19 changes: 19 additions & 0 deletions cpp/examples/symreg/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#=============================================================================
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=============================================================================

# Symbolic regression example, built as part of the cuML build tree.
add_executable(symreg_example symreg_example.cpp)

# The cuML headers are only needed to compile this executable, so they are
# attached with PRIVATE visibility.
target_include_directories(symreg_example PRIVATE ${CUML_INCLUDE_DIRECTORIES})

# Use the keyword signature of target_link_libraries: the keyword-less form
# has legacy semantics and cannot be mixed with keyword calls on the same
# target. An executable has no consumers, so PRIVATE is the right scope.
target_link_libraries(symreg_example PRIVATE cuml++)
33 changes: 33 additions & 0 deletions cpp/examples/symreg/CMakeLists_standalone.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Standalone build of the symbolic regression example against an installed
# cuML package.
#
# find_package(CUDAToolkit) and the CUDA::cudart imported target used below
# were introduced in CMake 3.17, so that is the real minimum version.
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
include(ExternalProject)

project(symreg_example VERSION 0.1.0 LANGUAGES CXX CUDA)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Both packages are mandatory: fail at configure time with a clear message
# instead of a confusing "target not found" error at the link step.
find_package(CUDAToolkit REQUIRED)
find_package(cuml REQUIRED)

add_executable(symreg_example symreg_example.cpp)

# Need to set linker language to CUDA to link the CUDA Runtime
set_target_properties(symreg_example PROPERTIES LINKER_LANGUAGE "CUDA")

# Link cuml and cudart through their imported targets. PRIVATE because an
# executable has no downstream consumers.
target_link_libraries(symreg_example PRIVATE cuml::cuml++ CUDA::cudart)
87 changes: 87 additions & 0 deletions cpp/examples/symreg/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# symbolic regression
This subfolder contains an example of how to perform symbolic regression in cuML (from C++).
There are two `CMakeLists.txt` in this folder:
1. `CMakeLists.txt` (default) which is included when building cuML
2. `CMakeLists_standalone.txt` as an example for a standalone project linking to `libcuml.so`

## Build
`symreg_example` is built as a part of cuML. To build it as a standalone executable, do
```bash
$ cmake .. -DCUML_LIBRARY_DIR=/path/to/directory/with/libcuml.so -DCUML_INCLUDE_DIR=/path/to/cuml/headers
```
Then build with `make` or `ninja`
```
$ make
Scanning dependencies of target raft
[ 10%] Creating directories for 'raft'
[ 20%] Performing download step (git clone) for 'raft'
Cloning into 'raft'...
[ 30%] Performing update step for 'raft'
[ 40%] No patch step for 'raft'
[ 50%] No configure step for 'raft'
[ 60%] No build step for 'raft'
[ 70%] No install step for 'raft'
[ 80%] Completed 'raft'
[ 80%] Built target raft
Scanning dependencies of target symreg_example
[ 90%] Building CXX object CMakeFiles/symreg_example.dir/symreg_example.cpp.o
[100%] Linking CUDA executable symreg_example
[100%] Built target symreg_example
```
`CMakeLists_standalone.txt` also loads a minimal set of header dependencies (namely [raft](https://github.com/rapidsai/raft) and [cub](https://github.com/NVIDIA/cub)) if they are not detected in the system.
## Run

1. Generate a toy training and test dataset
```
$ python prepare_input.py
Training set has n_rows=250 n_cols=2
Test set has n_rows=50 n_cols=2
Wrote 500 values to train_data.txt
Wrote 100 values to test_data.txt
Wrote 250 values to train_labels.txt
Wrote 50 values to test_labels.txt
```

2. Run the symbolic regressor using the 4 files as inputs. An example query is given below:
```bash
$ ./symreg_example -n_cols 2 \
-n_train_rows 250 \
-n_test_rows 50 \
-random_state 21 \
-population_size 4000 \
-generations 20 \
-stopping_criteria 0.01 \
-p_crossover 0.7 \
-p_subtree 0.1 \
-p_hoist 0.05 \
-p_point 0.1 \
-parsimony_coefficient 0.01
```

3. The corresponding output for the above query is given below:

```
Reading input with 250 rows and 2 columns from train_data.txt.
Reading input with 250 rows from train_labels.txt.
Reading input with 50 rows and 2 columns from test_data.txt.
Reading input with 50 rows from test_labels.txt.
***************************************
Allocating device memory...
Allocation time = 0.259072ms
***************************************
Beginning training on given dataset...
Finished training for 4 generations.
Best AST index : 1855
Best AST depth : 3
Best AST length : 13
Best AST equation :( add( sub( mult( X0, X0) , div( X1, X1) ) , sub( X1, mult( X1, X1) ) ) )
Training time = 626.658ms
***************************************
Beginning Inference on Test dataset...
Inference score on test set = 5.29271e-08
Inference time = 0.35248ms
Some Predicted test values:
-1.65061;-1.64081;-0.91711;-2.28976;-0.280688;
Corresponding Actual test values:
-1.65061;-1.64081;-0.91711;-2.28976;-0.280688;
```
46 changes: 46 additions & 0 deletions cpp/examples/symreg/prepare_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
from sklearn.model_selection import train_test_split

def _target(features):
    """Return the ground-truth labels y = x0^2 - x1^2 + x1 - 1 for a 2-column array."""
    return features[:, 0]**2 - features[:, 1]**2 + features[:, 1] - 1


# Fixed seed so the generated datasets are reproducible.
rng = np.random.RandomState(seed=2021)

# 250 training samples with 2 features each, drawn uniformly from [-1, 1)
X_train = rng.uniform(-1, 1, 500).reshape(250, 2)
y_train = _target(X_train)

# 50 testing samples, drawn from the same generator after the training draw
X_test = rng.uniform(-1, 1, 100).reshape(50, 2)
y_test = _target(X_test)

print("Training set has n_rows=%d n_cols=%d" %(X_train.shape))
print("Test set has n_rows=%d n_cols=%d" %(X_test.shape))

# Output file names
train_data = "train_data.txt"
test_data = "test_data.txt"
train_labels = "train_labels.txt"
test_labels = "test_labels.txt"

# Feature matrices are transposed before saving so that the values land on
# disk in col-major order; the label vectors are written unchanged.
_outputs = (
    (train_data, X_train.T),
    (test_data, X_test.T),
    (train_labels, y_train),
    (test_labels, y_test),
)

# Write every dataset first, then report, keeping the original output order.
for _path, _values in _outputs:
    np.savetxt(_path, _values, fmt='%.7f')

for _path, _values in _outputs:
    print("Wrote %d values to %s"%(_values.size, _path))
Loading

0 comments on commit eab7dee

Please sign in to comment.