-
Notifications
You must be signed in to change notification settings - Fork 540
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
genetic programming initial structures (#3387)
This PR introduces/proposes some of the basic and core (gpu-friendly!) data structures for implementing gplearn in cuML in order to address the issue #2121 . Tagging all who will be involved in this development: @vinaydes @venkywonka @vimarsh6739. PS: It also contains an experimental register-based stack implementation that will be useful while implementing CUDA-based AST evaluation, which is needed for organizing tournaments. Authors: - Thejaswi. N. S (@teju85) Approvers: - Corey J. Nolet (@cjnolet) URL: #3387
- Loading branch information
Showing
12 changed files
with
864 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
/* | ||
* Copyright (c) 2020-2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include "node.h" | ||
#include "program.h" | ||
|
||
#include <cstdint> | ||
#include <string> | ||
#include <vector> | ||
|
||
namespace cuml { | ||
namespace genetic { | ||
|
||
/** Type of initialization of the member programs in the population */
enum class init_method_t : uint32_t {
  /** random nodes chosen, allowing shorter or asymmetrical trees */
  grow,
  /** growing till a randomly chosen depth */
  full,
  /** 50% of the population on `grow` and the rest with `full` */
  half_and_half,
};  // enum class init_method_t
|
||
/** fitness metric types */
enum class metric_t : uint32_t {
  /** mean absolute error (regression-only) */
  mae,
  /** mean squared error (regression-only) */
  mse,
  /** root mean squared error (regression-only) */
  rmse,
  /** pearson product-moment coefficient (regression and transformation) */
  pearson,
  /** spearman's rank-order coefficient (regression and transformation) */
  spearman,
  /** binary cross-entropy loss (classification-only) */
  logloss,
};  // enum class metric_t
|
||
/** transformation functions used to map program output to class probabilities */
enum class transformer_t : uint32_t {
  /** sigmoid function */
  sigmoid,
};  // enum class transformer_t
|
||
/** | ||
* @brief contains all the hyper-parameters for training | ||
* | ||
* @note Unless otherwise mentioned, all the parameters below are applicable to | ||
* all of classification, regression and transformation. | ||
*/ | ||
struct param { | ||
/** number of programs in each generation */ | ||
int population_size = 1000; | ||
/** | ||
* number of fittest programs to compare during correlation | ||
* (transformation-only) | ||
*/ | ||
int hall_of_fame = 100; | ||
/** | ||
* number of fittest programs to return from `hall_of_fame` top programs | ||
* (transformation-only) | ||
*/ | ||
int n_components = 10; | ||
/** number of generations to evolve */ | ||
int generations = 20; | ||
/** | ||
* number of programs that compete in the tournament to become part of next | ||
* generation | ||
*/ | ||
int tournament_size = 20; | ||
/** metric threshold used for early stopping */ | ||
float stopping_criteria = 0.0f; | ||
/** minimum/maximum value for `constant` nodes */ | ||
float const_range[2] = {-1.0f, 1.0f}; | ||
/** minimum/maximum depth of programs after initialization */ | ||
int init_depth[2] = {2, 6}; | ||
/** initialization method */ | ||
init_method_t init_method = init_method_t::half_and_half; | ||
/** list of functions to choose from */ | ||
std::vector<node::type> function_set{node::type::add, node::type::mul, | ||
node::type::div, node::type::sub}; | ||
/** transformation function to class probabilities (classification-only) */ | ||
transformer_t transformer = transformer_t::sigmoid; | ||
/** fitness metric */ | ||
metric_t metric = metric_t::mae; | ||
/** penalization factor for large programs */ | ||
float parsimony_coefficient = 0.001f; | ||
/** crossover mutation probability of the tournament winner */ | ||
float p_crossover = 0.9f; | ||
/** subtree mutation probability of the tournament winner*/ | ||
float p_subtree_mutation = 0.01f; | ||
/** hoist mutation probability of the tournament winner */ | ||
float p_hoist_mutation = 0.01f; | ||
/** point mutation probabiilty of the tournament winner */ | ||
float p_point_mutation = 0.01f; | ||
/** point replace probabiility for point mutations */ | ||
float p_point_replace = 0.05f; | ||
/** subsampling factor */ | ||
float max_samples = 1.0f; | ||
/** list of feature names for generating syntax trees from the programs */ | ||
std::vector<std::string> feature_names; | ||
///@todo: feature_names | ||
///@todo: verbose | ||
/** random seed used for RNG */ | ||
uint64_t random_state = 0ull; | ||
|
||
/** Computes the probability of 'reproduction' */ | ||
float p_reproduce() const; | ||
|
||
/** maximum possible number of programs */ | ||
int max_programs() const; | ||
}; // struct param | ||
|
||
} // namespace genetic | ||
} // namespace cuml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
/* | ||
* Copyright (c) 2020-2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cstdint> | ||
#include <string> | ||
|
||
namespace cuml { | ||
namespace genetic { | ||
|
||
/**
 * @brief Represents a node in the syntax tree.
 *
 * @code{.cpp}
 * // A non-terminal (aka function) node
 * node func_node{node::type::sub};
 * // A constant node
 * float const_value = 2.f;
 * node const_node{const_value};
 * // A variable (aka feature) node
 * node var_node{20};
 * @endcode
 */
struct node {
  /**
   * @brief All possible types of nodes. For simplicity, all the terminal and
   *        non-terminal types are clubbed together
   *
   * The `*_begin` / `*_end` enumerators are inclusive range markers: each one
   * aliases the first/last real enumerator of its category instead of taking
   * a value of its own.
   */
  enum class type : uint32_t {
    variable = 0,
    constant,

    // note: keep the enumerators in alphabetical order under each category
    // of operators.
    functions_begin,
    // different binary function types follow
    binary_begin = functions_begin,
    add          = binary_begin,
    atan2,
    div,
    fdim,
    max,
    min,
    mul,
    pow,
    sub,
    binary_end = sub,  // keep this to be the last binary function in the list
    // different unary function types follow
    unary_begin,
    abs = unary_begin,
    acos,
    acosh,
    asin,
    asinh,
    atan,
    atanh,
    cbrt,
    cos,
    cosh,
    cube,
    exp,
    inv,
    log,
    neg,
    rcbrt,
    rsqrt,
    sin,
    sinh,
    sq,
    sqrt,
    tan,
    tanh,
    unary_end     = tanh,  // keep this to be the last unary function in the list
    functions_end = unary_end,
  };  // enum type

  /**
   * @brief Construct a function node
   *
   * @param[in] ft function type
   */
  explicit node(type ft);

  /**
   * @brief Construct a variable node
   *
   * @param[in] fid feature id that represents the variable
   */
  explicit node(int fid);

  /**
   * @brief Construct a constant node
   *
   * @param[in] val constant value
   */
  explicit node(float val);

  /**
   * @brief Copy constructor
   *
   * Deliberately NOT `explicit`: an explicit copy constructor is ignored by
   * copy-initialization, which would forbid `node a = b;`, passing a node by
   * value and `return n;` from a function returning `node`.
   *
   * @param[in] src source node to be copied
   */
  node(const node& src);

  /**
   * @brief assignment operator
   *
   * @param[in] src source node to be copied
   *
   * @return current node reference
   */
  node& operator=(const node& src);

  /** whether the current is either a variable or a constant */
  bool is_terminal() const;

  /** whether the current node is a function */
  bool is_nonterminal() const;

  /** Get the arity of the node. If it is a terminal, then a 0 is returned */
  int arity() const;

  /**
   * @brief Helper method to get node type from input string
   *
   * @param[in] ntype node type in string. Possible strings correlate one-to-one
   *                  with the enum values for `type`
   *
   * @return `type`
   */
  static type from_str(const std::string& ntype);

  /** constant used to represent invalid feature id */
  static const int kInvalidFeatureId;

  /** node type */
  type t;
  /** payload of the node; which member is meaningful depends on `t` */
  union {
    /**
     * if the node is `variable` type, then this is the column id to be used to
     * fetch its value, from the input dataset
     */
    int fid;
    /** if the node is `constant` type, then this is the value of the node */
    float val;
  } u;
};  // struct node
|
||
} // namespace genetic | ||
} // namespace cuml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/* | ||
* Copyright (c) 2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include "node.h" | ||
|
||
namespace cuml { | ||
namespace genetic { | ||
|
||
/** | ||
* @brief The main data structure to store the AST that represents a program | ||
* in the current generation | ||
*/ | ||
struct program { | ||
/** | ||
* the AST. It is stored in the reverse of DFS-right-child-first order. In | ||
* other words, construct a regular AST in the form of depth-first, but | ||
* instead of storing the left child first, store the right child and so on. | ||
* Now take the resulting 1D array and reverse it. | ||
* | ||
* @note The pointed memory buffer is NOT owned by this class and further it | ||
* is assumed to be a zero-copy (aka pinned memory) buffer, atleast in | ||
* this initial version | ||
*/ | ||
node* nodes; | ||
/** total number of nodes in this AST */ | ||
int len; | ||
/** maximum depth of this AST */ | ||
int depth; | ||
}; // struct program | ||
|
||
} // namespace genetic | ||
} // namespace cuml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
/* | ||
* Copyright (c) 2020-2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include "genetic.cuh" | ||
namespace cuml { | ||
namespace genetic { | ||
|
||
/** Computes the probability of 'reproduction' by forwarding to `detail::p_reproduce` */
float param::p_reproduce() const { return detail::p_reproduce(*this); }
|
||
/** Maximum possible number of programs, obtained by forwarding to `detail::max_programs` */
int param::max_programs() const { return detail::max_programs(*this); }
|
||
} // namespace genetic | ||
} // namespace cuml |
Oops, something went wrong.