Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] genetic programming initial structures #3387

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,8 @@ if(BUILD_CUML_CPP_LIBRARY)
src/fil/fil.cu
src/fil/infer.cu
src/glm/glm.cu
src/genetic/genetic.cu
src/genetic/node.cu
src/holtwinters/holtwinters.cu
src/kmeans/kmeans.cu
src/knn/knn.cu
Expand Down
130 changes: 130 additions & 0 deletions cpp/include/cuml/genetic/genetic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "node.h"
#include "program.h"

#include <cstdint>
#include <string>
#include <vector>

namespace cuml {
namespace genetic {

/**
 * @brief Strategy used to build the initial member programs of the population
 */
enum class init_method_t : uint32_t {
  /** nodes are picked at random, which allows shorter or asymmetrical trees */
  grow = 0,
  /** trees are grown until a randomly chosen depth is reached */
  full = 1,
  /** half of the population uses `grow`, the other half uses `full` */
  half_and_half = 2,
};  // enum class init_method_t

/**
 * @brief Fitness metrics available for scoring programs
 */
enum class metric_t : uint32_t {
  /** mean absolute error (regression-only) */
  mae = 0,
  /** mean squared error (regression-only) */
  mse = 1,
  /** root mean squared error (regression-only) */
  rmse = 2,
  /** Pearson product-moment coefficient (regression and transformation) */
  pearson = 3,
  /** Spearman's rank-order coefficient (regression and transformation) */
  spearman = 4,
  /** binary cross-entropy loss (classification-only) */
  logloss = 5,
};  // enum class metric_t

/**
 * @brief Functions that transform a program's output into class probabilities
 */
enum class transformer_t : uint32_t {
  /** sigmoid function */
  sigmoid = 0,
};  // enum class transformer_t

/**
 * @brief Bundle of all the hyper-parameters used for training
 *
 * @note Unless stated otherwise on a member, every parameter applies to all of
 *       classification, regression and transformation.
 */
struct param {
  /** number of programs in each generation */
  int population_size{1000};
  /**
   * number of fittest programs to compare during correlation
   * (transformation-only)
   */
  int hall_of_fame{100};
  /**
   * number of fittest programs to return from the `hall_of_fame` top programs
   * (transformation-only)
   */
  int n_components{10};
  /** number of generations to evolve */
  int generations{20};
  /**
   * number of programs that compete in the tournament to become part of the
   * next generation
   */
  int tournament_size{20};
  /** metric threshold used for early stopping */
  float stopping_criteria{0.0f};
  /** {minimum, maximum} value for `constant` nodes */
  float const_range[2]{-1.0f, 1.0f};
  /** {minimum, maximum} depth of programs after initialization */
  int init_depth[2]{2, 6};
  /** initialization method */
  init_method_t init_method{init_method_t::half_and_half};
  /** list of functions to choose from */
  std::vector<node::type> function_set{node::type::add, node::type::mul,
                                       node::type::div, node::type::sub};
  /** transformation function to class probabilities (classification-only) */
  transformer_t transformer{transformer_t::sigmoid};
  /** fitness metric */
  metric_t metric{metric_t::mae};
  /** penalization factor for large programs */
  float parsimony_coefficient{0.001f};
  /** crossover mutation probability of the tournament winner */
  float p_crossover{0.9f};
  /** subtree mutation probability of the tournament winner */
  float p_subtree_mutation{0.01f};
  /** hoist mutation probability of the tournament winner */
  float p_hoist_mutation{0.01f};
  /** point mutation probability of the tournament winner */
  float p_point_mutation{0.01f};
  /** point replace probability for point mutations */
  float p_point_replace{0.05f};
  /** subsampling factor */
  float max_samples{1.0f};
  /** list of feature names for generating syntax trees from the programs */
  std::vector<std::string> feature_names;
  ///@todo: verbose
  /** random seed used for RNG */
  uint64_t random_state{0ull};

  /** Computes the probability of 'reproduction' */
  float p_reproduce() const;

  /** maximum possible number of programs */
  int max_programs() const;
};  // struct param

} // namespace genetic
} // namespace cuml
162 changes: 162 additions & 0 deletions cpp/include/cuml/genetic/node.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cstdint>
#include <string>

namespace cuml {
namespace genetic {

/**
 * @brief Represents a node in the syntax tree.
 *
 * A node carries its `type` tag plus a small payload (`u`) whose active member
 * depends on that tag: `u.fid` for `variable` nodes and `u.val` for `constant`
 * nodes.
 *
 * @code{.cpp}
 * // A non-terminal (aka function) node
 * node func_node{node::type::sub};
 * // A constant node
 * float const_value = 2.f;
 * node const_node{const_value};
 * // A variable (aka feature) node
 * node var_node{20};
 * @endcode
 */
struct node {
  /**
   * @brief All possible types of nodes. For simplicity, all the terminal and
   *        non-terminal types are clubbed together
   */
  enum class type : uint32_t {
    variable = 0,
    constant,

    // note: keep the enumerators in alphabetical order under each category of
    // operators. The `*_begin` / `*_end` aliases below delimit each category
    // (both ends inclusive), so new entries must stay inside their range.
    functions_begin,
    // different binary function types follow
    binary_begin = functions_begin,
    add = binary_begin,
    atan2,
    div,
    fdim,
    max,
    min,
    mul,
    pow,
    sub,
    binary_end = sub,  // keep this to be the last binary function in the list
    // different unary function types follow
    unary_begin,
    abs = unary_begin,
    acos,
    acosh,
    asin,
    asinh,
    atan,
    atanh,
    cbrt,
    cos,
    cosh,
    cube,
    exp,
    inv,
    log,
    neg,
    rcbrt,
    rsqrt,
    sin,
    sinh,
    sq,
    sqrt,
    tan,
    tanh,
    unary_end = tanh,  // keep this to be the last unary function in the list
    functions_end = unary_end,
  };  // enum type

  /**
   * @brief Construct a function node
   *
   * @param[in] ft function type
   */
  explicit node(type ft);

  /**
   * @brief Construct a variable node
   *
   * @param[in] fid feature id that represents the variable
   */
  explicit node(int fid);

  /**
   * @brief Construct a constant node
   *
   * @param[in] val constant value
   */
  explicit node(float val);

  /**
   * @brief Copy constructor
   *
   * @note Intentionally NOT `explicit`. A copy is not a conversion, and an
   *       explicit copy constructor would forbid copy-initialization
   *       (`node a = b;`) as well as passing or returning a named `node` by
   *       value.
   *
   * @param[in] src source node to be copied
   */
  node(const node& src);

  /**
   * @brief assignment operator
   *
   * @param[in] src source node to be copied
   *
   * @return current node reference
   */
  node& operator=(const node& src);

  /** whether the current node is either a variable or a constant */
  bool is_terminal() const;

  /** whether the current node is a function */
  bool is_nonterminal() const;

  /** Get the arity of the node. If it is a terminal, then a 0 is returned */
  int arity() const;

  /**
   * @brief Helper method to get node type from input string
   *
   * @param[in] ntype node type in string. Possible strings correlate
   *                  one-to-one with the enum values for `type`
   *
   * @return `type`
   */
  static type from_str(const std::string& ntype);

  /** constant used to represent invalid feature id */
  static const int kInvalidFeatureId;

  /** node type */
  type t;
  union {
    /**
     * if the node is `variable` type, then this is the column id to be used to
     * fetch its value, from the input dataset
     */
    int fid;
    /** if the node is `constant` type, then this is the value of the node */
    float val;
  } u;
};  // struct node

} // namespace genetic
} // namespace cuml
47 changes: 47 additions & 0 deletions cpp/include/cuml/genetic/program.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "node.h"

namespace cuml {
namespace genetic {

/**
* @brief The main data structure to store the AST (abstract syntax tree) that
* represents a program in the current generation
*
* @note This struct declares no constructors and no default member
* initializers, so every field must be set by whoever creates it.
*/
struct program {
/**
* the AST. It is stored in the reverse of DFS-right-child-first order. In
* other words, construct a regular AST in the form of depth-first, but
* instead of storing the left child first, store the right child and so on.
* Now take the resulting 1D array and reverse it.
*
* @note The pointed-to memory buffer is NOT owned by this struct. Further, it
* is assumed to be a zero-copy (aka pinned memory) buffer, at least in
* this initial version
*/
node* nodes;
/** total number of nodes in this AST, i.e. the number of entries in `nodes` */
int len;
/** maximum depth of this AST */
int depth;
}; // struct program

} // namespace genetic
} // namespace cuml
26 changes: 26 additions & 0 deletions cpp/src/genetic/genetic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "genetic.cuh"
namespace cuml {
namespace genetic {

/**
* Probability of 'reproduction' for this parameter set. The computation itself
* lives in `detail::p_reproduce`; this member only forwards `*this` to it.
*/
float param::p_reproduce() const { return detail::p_reproduce(*this); }

/**
* Maximum possible number of programs for this parameter set. The computation
* itself lives in `detail::max_programs`; this member only forwards `*this`.
*/
int param::max_programs() const { return detail::max_programs(*this); }

} // namespace genetic
} // namespace cuml
Loading