Skip to content

Commit

Permalink
genetic programming initial structures (#3387)
Browse files Browse the repository at this point in the history
This PR introduces/proposes some of the basic and core (gpu-friendly!) data structures for implementing gplearn in cuML, in order to address issue #2121.

Tagging all who will be involved in this development: @vinaydes @venkywonka @vimarsh6739.

PS: It also contains an experimental register-based stack implementation that will be useful while implementing CUDA-based AST evaluation, which is needed for organizing tournaments.

Authors:
  - Thejaswi. N. S (@teju85)

Approvers:
  - Corey J. Nolet (@cjnolet)

URL: #3387
  • Loading branch information
teju85 authored Feb 3, 2021
1 parent a3c62b1 commit 9196252
Show file tree
Hide file tree
Showing 12 changed files with 864 additions and 1 deletion.
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,8 @@ if(BUILD_CUML_CPP_LIBRARY)
src/fil/fil.cu
src/fil/infer.cu
src/glm/glm.cu
src/genetic/genetic.cu
src/genetic/node.cu
src/holtwinters/holtwinters.cu
src/kmeans/kmeans.cu
src/knn/knn.cu
Expand Down
130 changes: 130 additions & 0 deletions cpp/include/cuml/genetic/genetic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "node.h"
#include "program.h"

#include <cstdint>
#include <string>
#include <vector>

namespace cuml {
namespace genetic {

/**
 * @brief Strategy used to initialize the member programs of the population
 */
enum class init_method_t : uint32_t {
  /** nodes are chosen at random, which allows shorter or asymmetrical trees */
  grow = 0,
  /** trees are grown till a randomly chosen depth */
  full = 1,
  /** half of the population is built with `grow`, the other half with `full` */
  half_and_half = 2,
};  // enum class init_method_t

/**
 * @brief Fitness metrics that can be used to score a program
 */
enum class metric_t : uint32_t {
  /** mean absolute error (regression-only) */
  mae = 0,
  /** mean squared error (regression-only) */
  mse = 1,
  /** root mean squared error (regression-only) */
  rmse = 2,
  /** pearson product-moment coefficient (regression and transformation) */
  pearson = 3,
  /** spearman's rank-order coefficient (regression and transformation) */
  spearman = 4,
  /** binary cross-entropy loss (classification-only) */
  logloss = 5,
};  // enum class metric_t

/**
 * @brief Functions available for transforming a program's output into class
 *        probabilities
 */
enum class transformer_t : uint32_t {
  /** sigmoid function */
  sigmoid = 0,
};  // enum class transformer_t

/**
 * @brief Collection of all the hyper-parameters that control training
 *
 * @note Every parameter applies to classification, regression and
 *       transformation alike, unless its description says otherwise.
 */
struct param {
  /** how many programs make up each generation */
  int population_size{1000};
  /**
   * how many of the fittest programs are compared during correlation
   * (transformation-only)
   */
  int hall_of_fame{100};
  /**
   * how many of the fittest programs, out of the top `hall_of_fame` ones, are
   * returned (transformation-only)
   */
  int n_components{10};
  /** how many generations to evolve */
  int generations{20};
  /**
   * how many programs compete in a tournament in order to become part of the
   * next generation
   */
  int tournament_size{20};
  /** threshold on the metric, used for early stopping */
  float stopping_criteria{0.0f};
  /** {minimum, maximum} value for `constant` nodes */
  float const_range[2]{-1.0f, 1.0f};
  /** {minimum, maximum} depth of the programs after initialization */
  int init_depth[2]{2, 6};
  /** how the initial programs are built */
  init_method_t init_method{init_method_t::half_and_half};
  /** functions that programs are allowed to pick from */
  std::vector<node::type> function_set{node::type::add, node::type::mul,
                                       node::type::div, node::type::sub};
  /** transformation function to class probabilities (classification-only) */
  transformer_t transformer{transformer_t::sigmoid};
  /** metric used to measure fitness */
  metric_t metric{metric_t::mae};
  /** factor by which large programs are penalized */
  float parsimony_coefficient{0.001f};
  /** probability of a crossover mutation of the tournament winner */
  float p_crossover{0.9f};
  /** probability of a subtree mutation of the tournament winner */
  float p_subtree_mutation{0.01f};
  /** probability of a hoist mutation of the tournament winner */
  float p_hoist_mutation{0.01f};
  /** probability of a point mutation of the tournament winner */
  float p_point_mutation{0.01f};
  /** probability of replacing a point during point mutations */
  float p_point_replace{0.05f};
  /** subsampling factor */
  float max_samples{1.0f};
  /** feature names used when generating syntax trees from the programs */
  std::vector<std::string> feature_names;
  ///@todo: verbose
  /** seed for the random number generator */
  uint64_t random_state{0ull};

  /** Computes the probability of 'reproduction' */
  float p_reproduce() const;

  /** Maximum possible number of programs */
  int max_programs() const;
};  // struct param

} // namespace genetic
} // namespace cuml
162 changes: 162 additions & 0 deletions cpp/include/cuml/genetic/node.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cstdint>
#include <string>

namespace cuml {
namespace genetic {

/**
 * @brief Represents a node in the syntax tree.
 *
 * A node is either a terminal (a variable or a constant) or a non-terminal
 * (a function). The kind of node is stored in `t`; the payload for terminal
 * nodes is stored in the union `u`.
 *
 * @code{.cpp}
 * // A non-terminal (aka function) node
 * node func_node{node::type::sub};
 * // A constant node
 * float const_value = 2.f;
 * node const_node{const_value};
 * // A variable (aka feature) node
 * node var_node{20};
 * // Nodes can be copied like any other value type
 * node copy = var_node;
 * @endcode
 */
struct node {
  /**
   * @brief All possible types of nodes. For simplicity, all the terminal and
   *        non-terminal types are clubbed together
   *
   * @note The `*_begin` / `*_end` enumerators are aliases that mark the
   *       (inclusive) range of each category, so the relative order of the
   *       enumerators below must be preserved.
   */
  enum class type : uint32_t {
    variable = 0,
    constant,

    // note: keep the case statements in alphabetical order under each category
    // of operators.
    functions_begin,
    // different binary function types follow
    binary_begin = functions_begin,
    add = binary_begin,
    atan2,
    div,
    fdim,
    max,
    min,
    mul,
    pow,
    sub,
    binary_end = sub,  // keep this to be the last binary function in the list
    // different unary function types follow
    unary_begin,
    abs = unary_begin,
    acos,
    acosh,
    asin,
    asinh,
    atan,
    atanh,
    cbrt,
    cos,
    cosh,
    cube,
    exp,
    inv,
    log,
    neg,
    rcbrt,
    rsqrt,
    sin,
    sinh,
    sq,
    sqrt,
    tan,
    tanh,
    unary_end = tanh,  // keep this to be the last unary function in the list
    functions_end = unary_end,
  };  // enum type

  /**
   * @brief Construct a function node
   *
   * @param[in] ft function type
   */
  explicit node(type ft);

  /**
   * @brief Construct a variable node
   *
   * @param[in] fid feature id that represents the variable
   */
  explicit node(int fid);

  /**
   * @brief Construct a constant node
   *
   * @param[in] val constant value
   */
  explicit node(float val);

  /**
   * @brief Copy constructor
   *
   * Deliberately NOT marked `explicit`: an explicit copy constructor is
   * ignored during copy-initialization, which would make `node b = a;`,
   * passing a node by value and `return some_node;` ill-formed.
   *
   * @param[in] src source node to be copied
   */
  node(const node& src);

  /**
   * @brief assignment operator
   *
   * @param[in] src source node to be copied
   *
   * @return current node reference
   */
  node& operator=(const node& src);

  /** whether the current node is either a variable or a constant */
  bool is_terminal() const;

  /** whether the current node is a function */
  bool is_nonterminal() const;

  /** Get the arity of the node. If it is a terminal, then a 0 is returned */
  int arity() const;

  /**
   * @brief Helper method to get node type from input string
   *
   * @param[in] ntype node type in string. Possible strings correlate
   *                  one-to-one with the enum values for `type`
   *
   * @return `type`
   */
  static type from_str(const std::string& ntype);

  /** constant used to represent invalid feature id */
  static const int kInvalidFeatureId;

  /** node type */
  type t;
  /** payload of the node; which member is meaningful depends on `t` */
  union {
    /**
     * if the node is `variable` type, then this is the column id to be used
     * to fetch its value, from the input dataset
     */
    int fid;
    /** if the node is `constant` type, then this is the value of the node */
    float val;
  } u;
};  // struct node

} // namespace genetic
} // namespace cuml
47 changes: 47 additions & 0 deletions cpp/include/cuml/genetic/program.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "node.h"

namespace cuml {
namespace genetic {

/**
* @brief The main data structure to store the AST that represents a program
* in the current generation
*
* @note This is a plain, non-owning view: it only carries a pointer to the
* nodes together with the size and depth of the tree.
*/
struct program {
/**
* the AST. It is stored in the reverse of DFS-right-child-first order. In
* other words, construct a regular AST in the form of depth-first, but
* instead of storing the left child first, store the right child and so on.
* Now take the resulting 1D array and reverse it.
*
* For example, following this description, `(a + b) * c` is first laid out
* as `mul, c, add, b, a` and then reversed into `a, b, add, c, mul`.
*
* @note The pointed memory buffer is NOT owned by this class and further it
* is assumed to be a zero-copy (aka pinned memory) buffer, at least in
* this initial version
*/
node* nodes;
/**
* total number of nodes in this AST (presumably also the number of valid
* entries in `nodes` - TODO confirm against the code that fills it)
*/
int len;
/** maximum depth of this AST */
int depth;
}; // struct program

} // namespace genetic
} // namespace cuml
26 changes: 26 additions & 0 deletions cpp/src/genetic/genetic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "genetic.cuh"
namespace cuml {
namespace genetic {

float param::p_reproduce() const { return detail::p_reproduce(*this); }

int param::max_programs() const { return detail::max_programs(*this); }

} // namespace genetic
} // namespace cuml
Loading

0 comments on commit 9196252

Please sign in to comment.