diff --git a/docs/Parameters.rst b/docs/Parameters.rst index a996b0132852..19e646126f7c 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -312,6 +312,18 @@ Learning Control Parameters - dropout rate: a fraction of previous trees to drop during the dropout +- ``monotone_penalty`` :raw-html:`🔗︎`, default = ``0.``, type = double, aliases: ``monotone_splits_penalty``, constraints: ``0.0 <= monotone_penalty (< max_depth, if max_depth > 0)`` + + - used only if ``monotone_constraints`` is set + + - monotone penalty: a penalization of 0 is equivalent to no penalization. A penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits at a given depth is a continuous, increasing function of the penalization parameter +- ``monotone_precise_mode`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``monotone_constraints_precise_mode`` + + - used only if ``monotone_constraints`` is set + + - monotone precise mode: if set to false, the program will run as fast as without constraints, but the results may be over-constrained. If set to true, the program will be slower, but the results will be better. Note that if there are categorical features in the dataset, they will be split using the fast method regardless of this parameter. Also, the parameter can only be set to true if missing value handling is disabled + - ``max_drop`` :raw-html:`🔗︎`, default = ``50``, type = int - used only in ``dart`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 974735532c29..0230ae245d0f 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -325,6 +325,18 @@ struct Config { // desc = dropout rate: a fraction of previous trees to drop during the dropout double drop_rate = 0.1; + // alias = monotone_splits_penalty + // check = >=0.0 + // check = 0 + // desc = used only if ``monotone_constraints`` is set + // desc = monotone penalty: a penalization of 0 is equivalent to no penalization. A penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits at a given depth is a continuous, increasing function of the penalization parameter + double monotone_penalty = 0.; + + // alias = monotone_constraints_precise_mode + // desc = used only if ``monotone_constraints`` is set + // desc = monotone precise mode: if set to false, the program will run as fast as without constraints, but the results may be over-constrained. If set to true, the program will be slower, but the results will be better. Note that if there are categorical features in the dataset, they will be split using the fast method regardless of this parameter. Also, the parameter can only be set to true if missing value handling is disabled + bool monotone_precise_mode = false; + // desc = used only in ``dart`` // desc = max number of dropped trees during one boosting iteration // desc = ``<=0`` means no limit diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index f672f62b347d..6bc7130fcdb3 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -60,7 +60,7 @@ class Tree { int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, int left_cnt, int right_cnt, double left_weight, double right_weight, - float gain, MissingType missing_type, bool default_left); + float gain, MissingType missing_type, bool default_left, bool feature_is_monotone);
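For intuition about the new flag: the ``feature_is_monotone`` argument lets ``Tree::Split`` mark both children whenever a monotone split occurs, and the mark is then inherited by every later descendant (see the inline ``Tree::Split`` and ``leaf_is_in_monotone_subtree_`` further in this diff). A minimal self-contained sketch of that bookkeeping (toy code, not the LightGBM API; as in the real ``Split``, the left child reuses the parent's leaf index and the right child gets a fresh one):

#include <cstdio>
#include <vector>

int main() {
  std::vector<bool> in_monotone_subtree(1, false);  // start with a single root leaf
  auto split = [&](int leaf, bool feature_is_monotone) {
    bool flag = feature_is_monotone || in_monotone_subtree[leaf];
    in_monotone_subtree[leaf] = flag;     // left child keeps the parent's leaf index
    in_monotone_subtree.push_back(flag);  // right child is appended as a new leaf
  };
  split(0, false);  // ordinary split: nothing is marked
  split(1, true);   // monotone split: leaves 1 and 2 are marked
  split(2, false);  // split below a monotone split: the mark is inherited
  for (size_t i = 0; i < in_monotone_subtree.size(); ++i) {
    std::printf("leaf %zu in monotone subtree: %d\n", i, static_cast<int>(in_monotone_subtree[i]));
  }
  return 0;
}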
/*! * \brief Performing a split on tree leaves, with categorical feature @@ -80,9 +80,14 @@ class Tree { * \param gain Split gain * \return The index of new leaf. */ - int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin, - const uint32_t* threshold, int num_threshold, double left_value, double right_value, - int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type); + + int SplitCategorical(int leaf, int feature, int real_feature, + const uint32_t *threshold_bin, int num_threshold_bin, + const uint32_t *threshold, int num_threshold, + double left_value, double right_value, int left_cnt, + int right_cnt, double left_weight, double right_weight, + float gain, MissingType missing_type, + bool feature_is_monotone); /*! \brief Get the output of one leaf */ inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; } @@ -124,6 +129,24 @@ class Tree { inline int PredictLeafIndex(const double* feature_values) const; inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const; + // Get node parent + inline int node_parent(int node_idx) const; + // Get leaf parent + inline int leaf_parent(int node_idx) const; + + // Get children + inline int left_child(int node_idx) const; + inline int right_child(int node_idx) const; + + // Get whether the leaf is in a monotone subtree + inline bool leaf_is_in_monotone_subtree(int leaf_idx) const; + + inline double internal_value(int node_idx) const; + + inline uint32_t threshold_in_bin(int node_idx) const; + + // Get the feature corresponding to the split + inline int split_feature_inner(int node_idx) const; inline void PredictContrib(const double* feature_values, int num_features, double* output); @@ -302,8 +325,10 @@ class Tree { } } - inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, - double left_weight, double right_weight, float gain); + inline void Split(int leaf, int feature, int real_feature, double left_value, + double right_value, int left_cnt, int right_cnt, double left_weight, + double right_weight, float gain, bool feature_is_monotone);
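The parent accessors declared above (``node_parent``, ``leaf_parent``, ``left_child``, ``right_child``) enable the bottom-up traversals used by the constraint-update code in src/treelearner/monotone_constraints.cpp. A hedged sketch of the access pattern (hypothetical helper, assuming tree.h as modified by this diff, where ``node_parent`` returns -1 for the root):

#include <LightGBM/tree.h>

// Count the splits strictly above an internal node, walking up to the root.
int CountSplitsAbove(const LightGBM::Tree* tree, int node_idx) {
  int count = 0;
  for (int node = tree->node_parent(node_idx); node != -1;
       node = tree->node_parent(node)) {
    ++count;  // tree->split_feature_inner(node) would identify each split
  }
  return count;
}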
+ /*! * \brief Find leaf index of which record belongs by features * \param feature_values Feature value of this record @@ -402,12 +427,22 @@ class Tree { std::vector<int> leaf_depth_; double shrinkage_; int max_depth_; + // add parent node information + std::vector<int> node_parent_; + // Keeps track of the monotone splits above the leaf + std::vector<bool> leaf_is_in_monotone_subtree_; }; inline void Tree::Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, - double left_weight, double right_weight, float gain) { + double left_weight, double right_weight, float gain, bool feature_is_monotone) { int new_node_idx = num_leaves_ - 1; + + // Update if there is a monotone split above the leaf + if (feature_is_monotone || leaf_is_in_monotone_subtree_[leaf]) { + leaf_is_in_monotone_subtree_[leaf] = true; + leaf_is_in_monotone_subtree_[num_leaves_] = true; + } // update parent info int parent = leaf_parent_[leaf]; if (parent >= 0) { @@ -421,6 +456,7 @@ inline void Tree::Split(int leaf, int feature, int real_feature, // add new node split_feature_inner_[new_node_idx] = feature; split_feature_[new_node_idx] = real_feature; + node_parent_[new_node_idx] = parent; split_gain_[new_node_idx] = gain; // add two new leaves @@ -529,6 +565,41 @@ inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_values) const { return ~node; } +inline int Tree::node_parent(int node_idx) const { + return node_parent_[node_idx]; +} + +inline int Tree::left_child(int node_idx) const { + return left_child_[node_idx]; +} + +inline int Tree::right_child(int node_idx) const { + return right_child_[node_idx]; +} + +inline int Tree::split_feature_inner(int node_idx) const { + return split_feature_inner_[node_idx]; +} + +inline int Tree::leaf_parent(int node_idx) const { + return leaf_parent_[node_idx]; +} + +inline uint32_t Tree::threshold_in_bin(int node_idx) const { + #ifdef DEBUG + CHECK(node_idx >= 0); + #endif + return threshold_in_bin_[node_idx]; +} + +inline bool Tree::leaf_is_in_monotone_subtree(int leaf_idx) const { + return leaf_is_in_monotone_subtree_[leaf_idx]; +} + +inline double Tree::internal_value(int node_idx) const { + return internal_value_[node_idx]; +} + } // namespace LightGBM diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 4e44b2a2527a..e080bcb1d214 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -540,6 +540,9 @@ std::vector<double> GBDT::FeatureImportance(int num_iteration, int importance_type) { for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { if (models_[iter]->split_gain(split_idx) > 0) { + #ifdef DEBUG + CHECK(models_[iter]->split_feature(split_idx) >= 0); + #endif feature_importances[models_[iter]->split_feature(split_idx)] += 1.0; } } @@ -548,6 +551,9 @@ for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { if (models_[iter]->split_gain(split_idx) > 0) { + #ifdef DEBUG + CHECK(models_[iter]->split_feature(split_idx) >= 0); + #endif feature_importances[models_[iter]->split_feature(split_idx)] += models_[iter]->split_gain(split_idx); } }
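The two hunks above cover the two importance types, now with a debug-only check that the stored split feature index is valid: "split" importance counts uses of a feature, while "gain" importance sums the split gains. A compact standalone equivalent of the accumulation (toy code; the real entry point is GBDT::FeatureImportance):

#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main() {
  // (feature index, split gain) pairs collected across all trees
  std::vector<std::pair<int, double>> splits = {{0, 2.5}, {2, 1.0}, {0, 0.5}};
  std::map<int, double> by_split, by_gain;
  for (const auto& s : splits) {
    if (s.second > 0) {  // only splits with positive gain contribute
      by_split[s.first] += 1.0;
      by_gain[s.first] += s.second;
    }
  }
  for (const auto& kv : by_split) {
    std::printf("feature %d: split importance %g, gain importance %g\n",
                kv.first, kv.second, by_gain[kv.first]);
  }
  return 0;
}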
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 809b8a7843aa..0534c8f6dfa0 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -6,6 +6,7 @@ * This file is auto generated by LightGBM\helpers\parameter_generator.py from LightGBM\include\LightGBM\config.h file. */ #include +#include namespace LightGBM { std::unordered_map<std::string, std::string> Config::alias_table({ {"config_file", "config"}, @@ -80,6 +81,8 @@ std::unordered_map<std::string, std::string> Config::alias_table({ {"lambda", "lambda_l2"}, {"min_split_gain", "min_gain_to_split"}, {"rate_drop", "drop_rate"}, + {"monotone_splits_penalty", "monotone_penalty"}, + {"monotone_constraints_precise_mode", "monotone_precise_mode"}, {"topk", "top_k"}, {"mc", "monotone_constraints"}, {"monotone_constraint", "monotone_constraints"}, @@ -199,6 +202,8 @@ std::unordered_set<std::string> Config::parameter_set({ "lambda_l2", "min_gain_to_split", "drop_rate", + "monotone_penalty", + "monotone_precise_mode", "max_drop", "skip_drop", "xgboost_dart_mode", @@ -399,8 +404,21 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::string>& params) { if (GetString(params, "monotone_constraints", &tmp_str)) { monotone_constraints = Common::StringToArray<int8_t>(tmp_str, ','); + Log::Warning("The constraining method was just changed, which could significantly affect results of the algorithm"); } + GetDouble(params, "monotone_penalty", &monotone_penalty); + bool constraints_exist = false; + for (auto it = monotone_constraints.begin(); it != monotone_constraints.end(); + it++) { + if (*it != 0) { + constraints_exist = true; + } + } + CHECK(monotone_penalty == 0 || constraints_exist); + CHECK(max_depth <= 0 || monotone_penalty < max_depth); + CHECK(monotone_penalty >= 0.0); + if (GetString(params, "feature_contri", &tmp_str)) { feature_contri = Common::StringToArray<double>(tmp_str, ','); } @@ -476,6 +494,10 @@ void Config::GetMembersFromString(const std::unordered_mapconfig_->cegb_penalty_feature_lazy.empty()) { diff --git a/src/treelearner/data_parallel_tree_learner.cpp index ed677ecf88d5..87073099e9ce 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -146,7 +146,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { } template <typename TREELEARNER_T> -void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() { +void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) { TREELEARNER_T::ConstructHistograms(this->is_feature_used_, true); // construct local histograms #pragma omp parallel for schedule(static) @@ -160,11 +160,12 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() { // Reduce scatter for histogram Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(), block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramBinEntry::SumReducer); - this->FindBestSplitsFromHistograms(this->is_feature_used_, true); + this->FindBestSplitsFromHistograms(this->is_feature_used_, true, tree); } template <typename TREELEARNER_T> -void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) { +void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms( + const std::vector<int8_t> &, bool, const Tree *tree) { std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_, SplitInfo()); std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo()); std::vector<int8_t> smaller_node_used_features(this->num_features_, 1); @@ -190,13 +191,14 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const this->smaller_leaf_histogram_array_[feature_index].RawData()); SplitInfo smaller_split; // find best threshold for smaller child + // FIXME Fill the vectors with the actual constraints and thresholds + SplittingConstraints *constraints = nullptr; + std::vector<uint32_t> thresholds; this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold( - this->smaller_leaf_splits_->sum_gradients(), -
this->smaller_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), - this->smaller_leaf_splits_->min_constraint(), - this->smaller_leaf_splits_->max_constraint(), - &smaller_split); + this->smaller_leaf_splits_->sum_gradients(), + this->smaller_leaf_splits_->sum_hessians(), + GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), + &smaller_split, constraints); smaller_split.feature = real_feature_index; if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) { smaller_bests_per_thread[tid] = smaller_split; @@ -210,13 +212,12 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const this->smaller_leaf_histogram_array_[feature_index]); SplitInfo larger_split; // find best threshold for larger child + // FIXME Fill the vectors with the actual constraints and thresholds this->larger_leaf_histogram_array_[feature_index].FindBestThreshold( - this->larger_leaf_splits_->sum_gradients(), - this->larger_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(this->larger_leaf_splits_->LeafIndex()), - this->larger_leaf_splits_->min_constraint(), - this->larger_leaf_splits_->max_constraint(), - &larger_split); + this->larger_leaf_splits_->sum_gradients(), + this->larger_leaf_splits_->sum_hessians(), + GetGlobalDataCountInLeaf(this->larger_leaf_splits_->LeafIndex()), + &larger_split, constraints); larger_split.feature = real_feature_index; if (larger_split > larger_bests_per_thread[tid] && larger_node_used_features[feature_index]) { larger_bests_per_thread[tid] = larger_split; diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 20437c0be6b0..35cc51fc16a7 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -16,6 +16,7 @@ #include #include "split_info.hpp" +#include "monotone_constraints.hpp" namespace LightGBM { @@ -57,11 +58,15 @@ class FeatureHistogram { meta_ = meta; data_ = data; if (meta_->bin_type == BinType::NumericalBin) { - find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1 - , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); + find_best_threshold_fun_ = std::bind( + &FeatureHistogram::FindBestThresholdNumerical, this, + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, + std::placeholders::_4, std::placeholders::_5); } else { - find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1 - , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); + find_best_threshold_fun_ = std::bind( + &FeatureHistogram::FindBestThresholdCategorical, this, + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, + std::placeholders::_4, std::placeholders::_5); } } @@ -80,30 +85,50 @@ class FeatureHistogram { } } - void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - SplitInfo* output) { + void FindBestThreshold(double sum_gradient, double sum_hessian, + data_size_t num_data, SplitInfo *output, + SplittingConstraints *constraints) { output->default_left = true; output->gain = kMinScore; - find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output); + find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * 
kEpsilon, num_data, + output, constraints); output->gain *= meta_->penalty; } - void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - SplitInfo* output) { + void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, + data_size_t num_data, SplitInfo *output, + SplittingConstraints *constraints) { is_splittable_ = false; double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; + + // at this point, the following arrays contain the constraints applied on every part of the leaf + // since we are splitting the leaf in 2, we can compute the cumulative minimum / maximum in both directions + if (constraints != nullptr) { + constraints->ComputeCumulativeExtremums(); + } + if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->missing_type == MissingType::Zero) { - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, true, false); - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, 1, true, false); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, + true, false, constraints); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, 1, + true, false, constraints); } else { - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, false, true); - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, 1, false, true); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, + false, true, constraints); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, 1, + false, true, constraints); } } else { - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, false, false); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, + false, false, constraints); // fix the direction error when only have 2 bins if (meta_->missing_type == MissingType::NaN) { output->default_left = false; } output->gain -= min_gain_shift; output->monotone_type = meta_->monotone_type; - output->min_constraint = min_constraint; - output->max_constraint = max_constraint; } - void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data, - double min_constraint, double max_constraint, - SplitInfo* output) { + void FindBestThresholdCategorical( + double sum_gradient, double sum_hessian, data_size_t num_data, + SplitInfo *output, SplittingConstraints *constraints) { output->default_left = false; double best_gain = kMinScore; data_size_t best_left_count = 0; @@ -135,6 +158,10 @@ int best_threshold = -1; int best_dir = 1; + if (constraints != nullptr) { + constraints->InitializeIndices(1); + } + if (use_onehot) { for (int t = 0; t < used_bin; ++t) { // if data not enough, or sum hessian too small @@ -149,10 +176,16 @@ if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) continue; double
sum_other_gradient = sum_gradient - data_[t].sum_gradients; + +#ifdef DEBUG + CHECK(t >= 0); +#endif // current split gain - double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint, 0); + // the threshold is included in the left leaf + double current_gain = GetSplitGains( + sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, + data_[t].sum_hessians + kEpsilon, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints, 0); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -222,9 +255,12 @@ cnt_cur_group = 0; double sum_right_gradient = sum_gradient - sum_left_gradient; - double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint, 0); + // the threshold is included in the left leaf + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints, 0); + if (current_gain <= min_gain_shift) continue; is_splittable_ = true; if (current_gain > best_gain) { @@ -240,16 +276,29 @@ } if (is_splittable_) { - output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + SplittingConstraint *left; + SplittingConstraint *right; + if (constraints != nullptr) { + left = &(constraints->left); + right = &(constraints->right); + } + else { + left = nullptr; + right = nullptr; + } + + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + left); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - best_sum_left_gradient, + sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + right); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -273,8 +322,6 @@ } } output->monotone_type = 0; - output->min_constraint = min_constraint; - output->max_constraint = max_constraint; } } @@ -441,7 +488,9 @@ /*!
* \brief Set splittable to this histogram */ - void set_is_splittable(bool val) { is_splittable_ = val; } + void set_is_splittable(bool val) { + is_splittable_ = val; + } static double ThresholdL1(double s, double l1) { const double reg_s = std::max(0.0, std::fabs(s) - l1); @@ -457,38 +506,51 @@ } } - private: - static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, - double sum_right_gradients, double sum_right_hessians, - double l1, double l2, double max_delta_step, - double min_constraint, double max_constraint, int8_t monotone_constraint) { - double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); - double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); - if (((monotone_constraint > 0) && (left_output > right_output)) || - ((monotone_constraint < 0) && (left_output < right_output))) { - return 0; - } - return GetLeafSplitGainGivenOutput(sum_left_gradients, sum_left_hessians, l1, l2, left_output) - + GetLeafSplitGainGivenOutput(sum_right_gradients, sum_right_hessians, l1, l2, right_output); - } - /*! * \brief Calculate the output of a leaf based on regularized sum_gradients and sum_hessians * \param sum_gradients * \param sum_hessians * \return leaf output */ + template <typename T> static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step, - double min_constraint, double max_constraint) { + const T *constraint) { double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step); - if (ret < min_constraint) { - ret = min_constraint; - } else if (ret > max_constraint) { - ret = max_constraint; + if (constraint != nullptr) { + if (ret < constraint->GetCurrentMinConstraint()) { + ret = constraint->GetCurrentMinConstraint(); + } else if (ret > constraint->GetCurrentMaxConstraint()) { + ret = constraint->GetCurrentMaxConstraint(); + } } return ret; } + private: + static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, + double sum_right_gradients, double sum_right_hessians, + double l1, double l2, double max_delta_step, + const SplittingConstraints *constraints, int8_t monotone_constraint) { + const SplittingConstraint *left; + const SplittingConstraint *right; + if (constraints != nullptr) { + left = &(constraints->left); + right = &(constraints->right); + } + else { + left = nullptr; + right = nullptr; + } + double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, left); + double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, right); + if (((monotone_constraint > 0) && (left_output > right_output)) || + ((monotone_constraint < 0) && (left_output < right_output))) { + return 0; + } + return GetLeafSplitGainGivenOutput(sum_left_gradients, sum_left_hessians, l1, l2, left_output) + + GetLeafSplitGainGivenOutput(sum_right_gradients, sum_right_hessians, l1, l2, right_output); + }
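GetSplitGains above carries the core of the fast constraining method: a numerical split whose raw leaf outputs violate the requested monotone direction is given a gain of 0 and therefore never wins. A standalone toy of that guard (illustrative only; real callers pass the regularized outputs computed by CalculateSplittedLeafOutput):

#include <cstdio>

double GuardedGain(double left_output, double right_output,
                   int monotone_constraint, double unconstrained_gain) {
  if ((monotone_constraint > 0 && left_output > right_output) ||
      (monotone_constraint < 0 && left_output < right_output)) {
    return 0.0;  // the split violates the monotone direction
  }
  return unconstrained_gain;
}

int main() {
  std::printf("%g\n", GuardedGain(0.5, 0.2, +1, 3.7));  // 0: violates increasing
  std::printf("%g\n", GuardedGain(0.1, 0.2, +1, 3.7));  // 3.7: allowed
  return 0;
}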
+ /*! * \brief Calculate the split gain based on regularized sum_gradients and sum_hessians * \param sum_gradients @@ -505,12 +567,20 @@ class FeatureHistogram { return -(2.0 * sg_l1 * output + (sum_hessians + l2) * output * output); } - void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) { + void FindBestThresholdSequence(double sum_gradient, double sum_hessian, + data_size_t num_data, double min_gain_shift, + SplitInfo *output, int dir, + bool skip_default_bin, bool use_na_as_missing, + SplittingConstraints *constraints) { const int8_t bias = meta_->bias; double best_sum_left_gradient = NAN; double best_sum_left_hessian = NAN; + + // when the monotone precise mode is enabled, then the left and the right children may not + // have the same min and max constraints because constraints can depend on the thresholds + BestConstraints best_constraints = BestConstraints(); + double best_gain = kMinScore; data_size_t best_left_count = 0; uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin); @@ -523,6 +593,10 @@ int t = meta_->num_bin - 1 - bias - use_na_as_missing; const int t_end = 1 - bias; + if (constraints != nullptr) { + constraints->InitializeIndices(dir); + } + // from right to left, and we don't need data in bin0 for (; t >= t_end; --t) { // need to skip default bin @@ -543,10 +617,21 @@ if (sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) break; double sum_left_gradient = sum_gradient - sum_right_gradient; - // current split gain - double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint, meta_->monotone_type); + + // when the monotone precise mode is enabled, as t changes, the constraints applied on + // each child may change, because the constraints may depend on thresholds + if (constraints != nullptr) { + constraints->UpdateIndices(dir, bias, t); + } + + // when the algorithm goes through the thresholds we use the same index for cumulative arrays + // in both directions but each leaf is constrained according to the corresponding array + // the threshold is included in the left leaf + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + constraints, meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -560,6 +645,10 @@ // left is <= threshold, right is > threshold.
so this is t-1 best_threshold = static_cast<uint32_t>(t - 1 + bias); best_gain = current_gain; + + if (constraints != nullptr) { + best_constraints.Update(constraints); + } } } } else { @@ -582,6 +671,10 @@ t = -1; } + if (constraints != nullptr) { + constraints->InitializeIndices(dir); + } + for (; t <= t_end; ++t) { // need to skip default bin if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; } @@ -602,10 +695,15 @@ if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) break; double sum_right_gradient = sum_gradient - sum_left_gradient; - // current split gain - double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint, meta_->monotone_type); + + if (constraints != nullptr) { + constraints->UpdateIndices(1, bias, t); + } + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + constraints, meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -618,6 +716,10 @@ best_sum_left_hessian = sum_left_hessian; best_threshold = static_cast<uint32_t>(t + bias); best_gain = current_gain; + + if (constraints != nullptr) { + best_constraints.Update(constraints); + } } } } @@ -625,21 +727,24 @@ if (is_splittable_ && best_gain > output->gain) { // update split information output->threshold = best_threshold; - output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, &best_constraints.left); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - best_sum_left_gradient, + sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + &best_constraints.right); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; output->gain = best_gain; output->default_left = dir == -1; + + } } @@ -649,7 +754,8 @@ // std::vector data_; bool is_splittable_ = true; - std::function<void(double, double, data_size_t, double, double, SplitInfo*)> find_best_threshold_fun_; + std::function<void(double, double, data_size_t, SplitInfo*, SplittingConstraints*)> find_best_threshold_fun_; };
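Whenever a better gain is found in either scan above, best_constraints.Update(constraints) snapshots the constraint state active at that threshold, so the winning split's outputs can later be clipped with exactly the constraints that produced its gain. A toy of that bookkeeping (illustrative only; BestConstraints itself is part of this patch):

#include <cstdio>

int main() {
  double gains[] = {1.0, 3.0, 2.0};
  double min_constraint_at[] = {-0.5, -0.2, -0.1};  // constraint active per threshold
  double best_gain = -1e300, best_min_constraint = 0.0;
  for (int t = 0; t < 3; ++t) {
    if (gains[t] > best_gain) {
      best_gain = gains[t];
      best_min_constraint = min_constraint_at[t];  // the snapshot step
    }
  }
  std::printf("best gain %g under min constraint %g\n", best_gain, best_min_constraint);
  return 0;
}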
class HistogramPool { public: diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 745ca44be68b..d5e6c013b73d 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -52,8 +52,11 @@ void FeatureParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { } template <typename TREELEARNER_T> -void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) { - TREELEARNER_T::FindBestSplitsFromHistograms(is_feature_used, use_subtract); +void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms( + const std::vector<int8_t> &is_feature_used, bool use_subtract, + const Tree *tree) { + TREELEARNER_T::FindBestSplitsFromHistograms(is_feature_used, use_subtract, + tree); SplitInfo smaller_best_split, larger_best_split; // get best split at smaller leaf smaller_best_split = this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()]; diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index f279fdc7331e..12fbf746ec87 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -1085,6 +1085,7 @@ void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { double smaller_max = smaller_leaf_splits_->max_constraint(); double larger_min = larger_leaf_splits_->min_constraint(); double larger_max = larger_leaf_splits_->max_constraint(); + // FIXME This part of the code has not been updated smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); smaller_leaf_splits_->SetValueConstraint(smaller_min, smaller_max); diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index e46d0b846fcb..b63560340c53 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -38,27 +38,22 @@ class LeafSplits { * \param sum_gradients * \param sum_hessians */ - void Init(int leaf, const DataPartition* data_partition, double sum_gradients, double sum_hessians) { + void Init(int leaf, const DataPartition *data_partition, double sum_gradients, + double sum_hessians, int depth) { + depth_ = depth; leaf_index_ = leaf; data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); sum_gradients_ = sum_gradients; sum_hessians_ = sum_hessians; - min_val_ = -std::numeric_limits<double>::max(); - max_val_ = std::numeric_limits<double>::max(); } - void SetValueConstraint(double min, double max) { - min_val_ = min; - max_val_ = max; - } - - /*! * \brief Init splits on current leaf, it will traverse all data to sum up the results * \param gradients * \param hessians */ void Init(const score_t* gradients, const score_t* hessians) { + depth_ = 0; num_data_in_leaf_ = num_data_; leaf_index_ = 0; data_indices_ = nullptr; @@ -71,8 +66,6 @@ class LeafSplits { } sum_gradients_ = tmp_sum_gradients; sum_hessians_ = tmp_sum_hessians; - min_val_ = -std::numeric_limits<double>::max(); - max_val_ = std::numeric_limits<double>::max(); }
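With the min/max constraint storage removed, LeafSplits now only tracks the depth of the leaf it describes, which is what the depth-based gain penalty needs. A hedged sketch of how a tree learner would initialize it (assumed call sites, consistent with the signatures in this diff):

#include "leaf_splits.hpp"

void SketchInitLeafSplits(LightGBM::LeafSplits* splits, double sum_gradients,
                          double sum_hessians) {
  // root-level overloads set depth_ = 0 internally:
  splits->Init(sum_gradients, sum_hessians);  // splits->depth() == 0
  // after splitting a leaf at depth d, its children would be re-initialized
  // with depth d + 1 through the DataPartition-based overloads
}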
/*! @@ -82,7 +75,9 @@ class LeafSplits { * \param gradients * \param hessians */ - void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) { + void Init(int leaf, const DataPartition *data_partition, + const score_t *gradients, const score_t *hessians, int depth) { + depth_ = depth; leaf_index_ = leaf; data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); double tmp_sum_gradients = 0.0f; @@ -95,8 +90,6 @@ } sum_gradients_ = tmp_sum_gradients; sum_hessians_ = tmp_sum_hessians; - min_val_ = -std::numeric_limits<double>::max(); - max_val_ = std::numeric_limits<double>::max(); } @@ -106,22 +99,20 @@ * \param sum_hessians */ void Init(double sum_gradients, double sum_hessians) { + depth_ = 0; leaf_index_ = 0; sum_gradients_ = sum_gradients; sum_hessians_ = sum_hessians; - min_val_ = -std::numeric_limits<double>::max(); - max_val_ = std::numeric_limits<double>::max(); } /*! * \brief Init splits on current leaf */ void Init() { + depth_ = 0; leaf_index_ = -1; data_indices_ = nullptr; num_data_in_leaf_ = 0; - min_val_ = -std::numeric_limits<double>::max(); - max_val_ = std::numeric_limits<double>::max(); } @@ -137,9 +128,7 @@ /*! \brief Get sum of hessians of current leaf */ double sum_hessians() const { return sum_hessians_; } - double max_constraint() const { return max_val_; } - - double min_constraint() const { return min_val_; } + int depth() const { return depth_; } /*! \brief Get indices of data of current leaf */ const data_size_t* data_indices() const { return data_indices_; } @@ -158,8 +147,7 @@ double sum_hessians_; /*! \brief indices of data of current leaf */ const data_size_t* data_indices_; - double min_val_; - double max_val_; + int depth_; }; } // namespace LightGBM diff --git a/src/treelearner/monotone_constraints.cpp b/src/treelearner/monotone_constraints.cpp new file mode 100644 index 000000000000..7442063747c2 --- /dev/null +++ b/src/treelearner/monotone_constraints.cpp @@ -0,0 +1,368 @@ +#include "monotone_constraints.hpp" +#include "serial_tree_learner.h" +#include "feature_histogram.hpp" +#include "cost_effective_gradient_boosting.hpp" + +namespace LightGBM { + +void LeafConstraints::SetChildrenConstraintsFastMethod( + std::vector<LeafConstraints> &constraints_per_leaf, int *right_leaf, + int *left_leaf, int8_t monotone_type, double right_output, + double left_output, bool is_numerical_split) { + constraints_per_leaf[*right_leaf] = constraints_per_leaf[*left_leaf]; + if (is_numerical_split) { + // depending on the monotone type we set constraints on the future splits + // these constraints may be updated later in the algorithm + if (monotone_type < 0) { + constraints_per_leaf[*left_leaf].SetMinConstraint(right_output); + constraints_per_leaf[*right_leaf].SetMaxConstraint(left_output); + } else if (monotone_type > 0) { + constraints_per_leaf[*left_leaf].SetMaxConstraint(right_output); + constraints_per_leaf[*right_leaf].SetMinConstraint(left_output); + } + } +}
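A worked instance of the fast-method propagation implemented just above: for an increasing constraint (monotone_type > 0) with left_output = 0.2 and right_output = 0.5, future splits under the left child may not produce values above 0.5, and under the right child not below 0.2. Toy code (illustrative only; LeafConstraints itself stores these bounds per feature and threshold):

#include <cstdint>
#include <cstdio>

struct MinMax { double min, max; };

int main() {
  double left_output = 0.2, right_output = 0.5;
  int8_t monotone_type = +1;  // increasing constraint on the split feature
  MinMax left{-1e308, 1e308}, right{-1e308, 1e308};
  if (monotone_type < 0) {
    left.min = right_output;
    right.max = left_output;
  } else if (monotone_type > 0) {
    left.max = right_output;   // left leaf values must stay <= 0.5
    right.min = left_output;   // right leaf values must stay >= 0.2
  }
  std::printf("left: [%g, %g]  right: [%g, %g]\n", left.min, left.max, right.min, right.max);
  return 0;
}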
+ +// this function goes through the tree to find how the split that +// has just been performed is going to affect the constraints of other leaves +void LeafConstraints::GoUpToFindLeavesToUpdate( + int node_idx, std::vector<int> &features, std::vector<uint32_t> &thresholds, + std::vector<bool> &is_in_right_split, int split_feature, + const SplitInfo &split_info, double previous_leaf_output, + uint32_t split_threshold, std::vector<SplitInfo> &best_split_per_leaf_, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state) { + int parent_idx = learner_state.tree->node_parent(node_idx); + if (parent_idx != -1) { + int inner_feature = learner_state.tree->split_feature_inner(parent_idx); + int8_t monotone_type = + learner_state.train_data_->FeatureMonotone(inner_feature); + bool is_right_split = + learner_state.tree->right_child(parent_idx) == node_idx; + bool split_contains_new_information = true; + bool is_split_numerical = + learner_state.train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + + // only branches containing leaves that are contiguous to the original leaf + // need to be updated + for (unsigned int i = 0; i < features.size(); ++i) { + if ((features[i] == inner_feature && is_split_numerical) && + (is_in_right_split[i] == is_right_split)) { + split_contains_new_information = false; + break; + } + } + + if (split_contains_new_information) { + if (monotone_type != 0) { + int left_child_idx = learner_state.tree->left_child(parent_idx); + int right_child_idx = learner_state.tree->right_child(parent_idx); + bool left_child_is_curr_idx = (left_child_idx == node_idx); + int node_idx_to_pass = + (left_child_is_curr_idx) ? right_child_idx : left_child_idx; + bool take_min = (monotone_type < 0) ? left_child_is_curr_idx + : !left_child_is_curr_idx; + + GoDownToFindLeavesToUpdate( + node_idx_to_pass, features, thresholds, is_in_right_split, take_min, + split_feature, split_info, previous_leaf_output, true, true, + split_threshold, best_split_per_leaf_, is_feature_used_, + num_threads_, num_features_, histogram_pool_, learner_state); + } + + is_in_right_split.push_back(learner_state.tree->right_child(parent_idx) == + node_idx); + thresholds.push_back(learner_state.tree->threshold_in_bin(parent_idx)); + features.push_back(learner_state.tree->split_feature_inner(parent_idx)); + } + + if (parent_idx != 0) { + LeafConstraints::GoUpToFindLeavesToUpdate( + parent_idx, features, thresholds, is_in_right_split, split_feature, + split_info, previous_leaf_output, split_threshold, + best_split_per_leaf_, is_feature_used_, num_threads_, num_features_, + histogram_pool_, learner_state); + } + } +} + +// this function goes through the tree to find how the split that was just +// made is going to affect other leaves +void LeafConstraints::GoDownToFindLeavesToUpdate( + int node_idx, const std::vector<int> &features, + const std::vector<uint32_t> &thresholds, + const std::vector<bool> &is_in_right_split, int maximum, int split_feature, + const SplitInfo &split_info, double previous_leaf_output, + bool use_left_leaf, bool use_right_leaf, uint32_t split_threshold, + std::vector<SplitInfo> &best_split_per_leaf_, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state) { + if (node_idx < 0) { + int leaf_idx = ~node_idx; + + // if leaf is at max depth then there is no need to update it + int max_depth = learner_state.config_->max_depth; + if (learner_state.tree->leaf_depth(leaf_idx) >= max_depth && + max_depth > 0) { + return; + } + + // splits that are not to be used shall not be updated + if (best_split_per_leaf_[leaf_idx].gain == kMinScore) { + return; + } + + std::pair<double, double> min_max_constraints; + bool something_changed; + if (use_right_leaf && use_left_leaf) { + min_max_constraints = + std::minmax(split_info.right_output, split_info.left_output); + } else if (use_right_leaf && !use_left_leaf) { + min_max_constraints = std::pair<double, double>(split_info.right_output, + split_info.right_output); + } else { + min_max_constraints =
std::pair<double, double>(split_info.left_output, + split_info.left_output); + } + +#ifdef DEBUG + if (maximum) { + CHECK(min_max_constraints.first >= + learner_state.tree->LeafOutput(leaf_idx)); + } else { + CHECK(min_max_constraints.second <= + learner_state.tree->LeafOutput(leaf_idx)); + } +#endif + + if (!learner_state.config_->monotone_precise_mode) { + if (!maximum) { + something_changed = + learner_state.constraints_per_leaf_[leaf_idx] + .SetMinConstraintAndReturnChange(min_max_constraints.second); + } else { + something_changed = + learner_state.constraints_per_leaf_[leaf_idx] + .SetMaxConstraintAndReturnChange(min_max_constraints.first); + } + if (!something_changed) { + return; + } + } else { + if (!maximum) { + // both functions need to be called in this order + // because they modify the struct + something_changed = + learner_state.constraints_per_leaf_[leaf_idx] + .CrossesMinConstraint(min_max_constraints.second); + something_changed = learner_state.constraints_per_leaf_[leaf_idx] + .IsInMinConstraints(previous_leaf_output) || + something_changed; + } else { + // both functions need to be called in this order + // because they modify the struct + something_changed = + learner_state.constraints_per_leaf_[leaf_idx] + .CrossesMaxConstraint(min_max_constraints.first); + something_changed = learner_state.constraints_per_leaf_[leaf_idx] + .IsInMaxConstraints(previous_leaf_output) || + something_changed; + } + // if constraints have changed, then best splits need to be updated + // otherwise, we can just continue and go to the next split + if (!something_changed) { + return; + } + } + UpdateBestSplitsFromHistograms( + best_split_per_leaf_[leaf_idx], leaf_idx, + learner_state.tree->leaf_depth(leaf_idx), is_feature_used_, + num_threads_, num_features_, histogram_pool_, learner_state); + } else { + // check if the children are contiguous with the original leaf + std::pair<bool, bool> keep_going_left_right = ShouldKeepGoingLeftRight( + learner_state.tree, node_idx, features, thresholds, is_in_right_split, + learner_state.train_data_); + int inner_feature = learner_state.tree->split_feature_inner(node_idx); + uint32_t threshold = learner_state.tree->threshold_in_bin(node_idx); + bool is_split_numerical = + learner_state.train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + bool use_left_leaf_for_update = true; + bool use_right_leaf_for_update = true; + if (is_split_numerical && inner_feature == split_feature) { + if (threshold >= split_threshold) { + use_left_leaf_for_update = false; + } + if (threshold <= split_threshold) { + use_right_leaf_for_update = false; + } + } + + if (keep_going_left_right.first) { + GoDownToFindLeavesToUpdate( + learner_state.tree->left_child(node_idx), features, thresholds, + is_in_right_split, maximum, split_feature, split_info, + previous_leaf_output, use_left_leaf, + use_right_leaf_for_update && use_right_leaf, split_threshold, + best_split_per_leaf_, is_feature_used_, num_threads_, num_features_, + histogram_pool_, learner_state); + } + if (keep_going_left_right.second) { + GoDownToFindLeavesToUpdate( + learner_state.tree->right_child(node_idx), features, thresholds, + is_in_right_split, maximum, split_feature, split_info, + previous_leaf_output, use_left_leaf_for_update && use_left_leaf, + use_right_leaf, split_threshold, best_split_per_leaf_, + is_feature_used_, num_threads_, num_features_, histogram_pool_, + learner_state); + } + } +}
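Before the next helper, a worked instance of the contiguity rule it implements: if on the way up we crossed (feature 3, threshold 10) going left, then on the way down a node splitting feature 3 at threshold 12 cannot have leaves adjacent to the original leaf in its right subtree. Toy code (illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  // recorded on the way up: split on feature 3 at threshold 10, original leaf on the left
  int path_feature = 3; uint32_t path_threshold = 10; bool went_right = false;
  // node visited on the way down: same feature, threshold 12
  int node_feature = 3; uint32_t node_threshold = 12;
  bool keep_going_left = true, keep_going_right = true;
  if (node_feature == path_feature) {
    if (node_threshold >= path_threshold && !went_right) keep_going_right = false;
    if (node_threshold <= path_threshold && went_right) keep_going_left = false;
  }
  std::printf("keep left: %d, keep right: %d\n", keep_going_left, keep_going_right);
  return 0;
}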
+ +// this function checks if the original leaf and the children of the node that +// is currently being visited are contiguous, and if so, the children should +// be visited too +std::pair<bool, bool> LeafConstraints::ShouldKeepGoingLeftRight( + const Tree *tree, int node_idx, const std::vector<int> &features, + const std::vector<uint32_t> &thresholds, + const std::vector<bool> &is_in_right_split, const Dataset *train_data_) { + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + + bool keep_going_right = true; + bool keep_going_left = true; + // we check if the left and right node are contiguous with the original leaf + // if so we should keep going down these nodes to update constraints + for (unsigned int i = 0; i < features.size(); ++i) { + if (features[i] == inner_feature) { + if (is_split_numerical) { + if (threshold >= thresholds[i] && !is_in_right_split[i]) { + keep_going_right = false; + } + if (threshold <= thresholds[i] && is_in_right_split[i]) { + keep_going_left = false; + } + } + } + } + return std::pair<bool, bool>(keep_going_left, keep_going_right); +} + +// this function updates the best split for each leaf +// it is called only when monotone constraints exist +void LeafConstraints::UpdateBestSplitsFromHistograms( + SplitInfo &split, int leaf, int depth, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state) { + std::vector<SplitInfo> bests(num_threads_); + std::vector<bool> should_split_be_worse(num_threads_, false); + + // the feature histogram is retrieved + FeatureHistogram *histogram_array_; + histogram_pool_.Get(leaf, &histogram_array_); + + OMP_INIT_EX(); +#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + OMP_LOOP_EX_BEGIN(); + // only features that are supposed to be used are considered + if (!is_feature_used_[feature_index]) + continue; + if (!histogram_array_[feature_index].is_splittable()) { + continue; + } + + // loop through the features to find the best one just like in the + // FindBestSplitsFromHistograms function + const int tid = omp_get_thread_num(); + int real_fidx = learner_state.train_data_->RealFeatureIndex(feature_index); + + // if the monotone precise mode is disabled or if the constraints have to be + // updated, but are not exclusively worse, then we update the constraints + // and the best split + if (!learner_state.config_->monotone_precise_mode || + (learner_state.constraints_per_leaf_[leaf].ToBeUpdated(feature_index) && + !learner_state.constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index))) { + + SerialTreeLearner::ComputeBestSplitForFeature( + split.left_sum_gradient + split.right_sum_gradient, + split.left_sum_hessian + split.right_sum_hessian, + split.left_count + split.right_count, feature_index, histogram_array_, + bests, leaf, depth, tid, real_fidx, learner_state, true); + } else { + if (learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + + feature_index) > bests[tid]) { + bests[tid] = learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + feature_index); + should_split_be_worse[tid] = + learner_state.constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index); + } + } + + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + + auto best_idx = ArrayArgs<SplitInfo>::ArgMax(bests);
+ // if the best split that was found previously was evaluated with worse + // constraints than the true ones (which were not computed earlier, to save + // time), then we update every split and every constraint that should be + // updated + if (should_split_be_worse[best_idx]) { + std::fill(bests.begin(), bests.end(), SplitInfo()); +#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) + for (int feature_index = 0; feature_index < num_features_; + ++feature_index) { + OMP_LOOP_EX_BEGIN(); + if (!is_feature_used_[feature_index]) + continue; + if (!histogram_array_[feature_index].is_splittable()) { + continue; + } + + const int tid = omp_get_thread_num(); + int real_fidx = + learner_state.train_data_->RealFeatureIndex(feature_index); + + if (learner_state.constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index)) { + ; + } else { +#ifdef DEBUG + CHECK(!learner_state.constraints_per_leaf_[leaf] + .ToBeUpdated(feature_index)); +#endif + if (learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + + feature_index) > bests[tid]) { + bests[tid] = learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + feature_index); + } + } + + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + best_idx = ArrayArgs<SplitInfo>::ArgMax(bests); + } + + // note: the gains may differ for the same set of constraints due to the + // non-deterministic OMP reduction. + split = bests[best_idx]; +} + +} // namespace LightGBM diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp new file mode 100644 index 000000000000..a5fa8f2a4cc3 --- /dev/null +++ b/src/treelearner/monotone_constraints.hpp @@ -0,0 +1,639 @@ +#ifndef LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_ +#define LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_ + +#include +#include +#include "split_info.hpp" +#include +#include "data_partition.hpp" + +namespace LightGBM { + +struct CostEfficientGradientBoosting; +struct CurrentConstraints; +class HistogramPool; +struct LeafConstraints; + +struct LearnerState { + const Config *config_; + const DataPartition *data_partition_; + const Dataset *train_data_; + std::vector<LeafConstraints> &constraints_per_leaf_; + const Tree *tree; + CurrentConstraints &current_constraints; + std::unique_ptr<CostEfficientGradientBoosting> &cegb_; + + LearnerState(const Config *config_, + const DataPartition *data_partition_, + const Dataset *train_data_, + std::vector<LeafConstraints> &constraints_per_leaf_, + const Tree *tree, CurrentConstraints &current_constraints, + std::unique_ptr<CostEfficientGradientBoosting> &cegb_) + : config_(config_), data_partition_(data_partition_), + train_data_(train_data_), constraints_per_leaf_(constraints_per_leaf_), + tree(tree), current_constraints(current_constraints), cegb_(cegb_) {}; +}; + +// the purpose of this structure is to store the constraints for one leaf +// when the monotone precise mode is disabled, then it will just store +// one min and one max constraint +// but if the monotone precise mode is enabled, then it may store a +// large number of constraints for different thresholds and features +struct LeafConstraints { + std::vector<std::vector<double> > min_constraints; + std::vector<std::vector<double> > max_constraints; + // the constraint number i is valid on the slice + // [thresholds[i]:thresholds[i+1]) + // if thresholds[i+1] does not exist, then it is valid for thresholds following + // thresholds[i] + std::vector<std::vector<uint32_t> > min_thresholds; + std::vector<std::vector<uint32_t> > max_thresholds; + // These 2 vectors keep track of which constraints over which features + // have to be updated + std::vector<bool> min_to_be_updated; + std::vector<bool> max_to_be_updated; + // This vector keeps track of the
constraints that we didn't update for some + // features, because they could only be worse, and another better split was + // available, so we didn't need to compute them yet, but we may need to in the + // future + std::vector<bool> are_actual_constraints_worse; + + static void SetChildrenConstraintsFastMethod( + std::vector<LeafConstraints> &constraints_per_leaf, int *right_leaf, + int *left_leaf, int8_t monotone_type, double right_output, + double left_output, bool is_numerical_split); + + static void GoUpToFindLeavesToUpdate( + int node_idx, std::vector<int> &features, + std::vector<uint32_t> &thresholds, std::vector<bool> &is_in_right_split, + int split_feature, const SplitInfo &split_info, + double previous_leaf_output, uint32_t split_threshold, + std::vector<SplitInfo> &best_split_per_leaf_, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state); + + static void GoUpToFindLeavesToUpdate( + int node_idx, int split_feature, const SplitInfo &split_info, + double previous_leaf_output, uint32_t split_threshold, + std::vector<SplitInfo> &best_split_per_leaf_, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state) { + int depth = learner_state.tree->leaf_depth( + ~learner_state.tree->left_child(node_idx)) - + 1; + + std::vector<int> features; + std::vector<uint32_t> thresholds; + std::vector<bool> is_in_right_split; + + features.reserve(depth); + thresholds.reserve(depth); + is_in_right_split.reserve(depth); + + GoUpToFindLeavesToUpdate(node_idx, features, thresholds, is_in_right_split, + split_feature, split_info, previous_leaf_output, + split_threshold, best_split_per_leaf_, + is_feature_used_, num_threads_, num_features_, + histogram_pool_, learner_state); + } + + static void GoDownToFindLeavesToUpdate( + int node_idx, const std::vector<int> &features, + const std::vector<uint32_t> &thresholds, + const std::vector<bool> &is_in_right_split, int maximum, + int split_feature, const SplitInfo &split_info, + double previous_leaf_output, bool use_left_leaf, bool use_right_leaf, + uint32_t split_threshold, std::vector<SplitInfo> &best_split_per_leaf_, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state); + + static std::pair<bool, bool> ShouldKeepGoingLeftRight( + const Tree *tree, int node_idx, const std::vector<int> &features, + const std::vector<uint32_t> &thresholds, + const std::vector<bool> &is_in_right_split, const Dataset *train_data_); + + static void + UpdateBestSplitsFromHistograms(SplitInfo &split, int leaf, int depth, + const std::vector<int8_t> &is_feature_used_, + int num_threads_, int num_features_, + HistogramPool &histogram_pool_, + LearnerState &learner_state); + + bool IsInConstraints(double element, + const std::vector<std::vector<double> > &constraints, + std::vector<bool> &to_be_updated) { + bool ret = false; + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j < constraints[i].size(); j++) { + if (element == constraints[i][j]) { + ret = true; + to_be_updated[i] = true; + are_actual_constraints_worse[i] = false; + } + } + } + return ret; + } + + bool IsInMinConstraints(double min) { + return IsInConstraints(min, min_constraints, min_to_be_updated); + } + + bool IsInMaxConstraints(double max) { + return IsInConstraints(max, max_constraints, max_to_be_updated); + } + + void SetConstraint(double element, + std::vector<std::vector<double> > &constraints, + bool is_operator_greater) const { + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j <
constraints[i].size(); j++) { + if ((is_operator_greater && element > constraints[i][j]) || + (!is_operator_greater && element < constraints[i][j])) { + constraints[i][j] = element; + } + } + } + } + + // this function is the same as the previous one, but it also returns + // whether it actually modified something or not + bool + SetConstraintAndReturnChange(double element, + std::vector<std::vector<double> > &constraints, + bool is_operator_greater) const { + bool something_changed = false; + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j < constraints[i].size(); j++) { + if ((is_operator_greater && element > constraints[i][j]) || + (!is_operator_greater && element < constraints[i][j])) { + constraints[i][j] = element; + something_changed = true; + } + } + } + return something_changed; + } + + // this function checks if the element passed as a parameter would actually update + // the constraints if it were to be set as an additional constraint + bool CrossesConstraint(double element, + std::vector<std::vector<double> > &constraints, + bool is_operator_greater, + std::vector<bool> &to_be_updated) { + bool ret = false; + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j < constraints[i].size(); j++) { + if ((is_operator_greater && element > constraints[i][j]) || + (!is_operator_greater && element < constraints[i][j])) { + ret = true; + to_be_updated[i] = true; + are_actual_constraints_worse[i] = true; + } + } + } + return ret; + } + + bool SetMinConstraintAndReturnChange(double min) { + return SetConstraintAndReturnChange(min, min_constraints, true); + } + + bool SetMaxConstraintAndReturnChange(double max) { + return SetConstraintAndReturnChange(max, max_constraints, false); + } + + void SetMinConstraint(double min) { + SetConstraint(min, min_constraints, true); + } + + void SetMaxConstraint(double max) { + SetConstraint(max, max_constraints, false); + } + + bool CrossesMinConstraint(double min) { + return CrossesConstraint(min, min_constraints, true, min_to_be_updated); + } + + bool CrossesMaxConstraint(double max) { + return CrossesConstraint(max, max_constraints, false, max_to_be_updated); + } + + void ResetUpdates(unsigned int i) { +#ifdef DEBUG + CHECK(i < are_actual_constraints_worse.size()); +#endif + are_actual_constraints_worse[i] = false; + min_to_be_updated[i] = false; + max_to_be_updated[i] = false; + } + + // when the monotone precise mode is disabled, then we can just store + // 1 min and 1 max constraint per leaf, so we call this constructor + LeafConstraints() { + min_constraints.push_back( + std::vector<double>(1, -std::numeric_limits<double>::max())); + max_constraints.push_back( + std::vector<double>(1, std::numeric_limits<double>::max())); + min_thresholds.push_back(std::vector<uint32_t>(1, 0)); + max_thresholds.push_back(std::vector<uint32_t>(1, 0)); + } + + // when the monotone precise mode is enabled, then for each feature, + // we need to sort an array of constraints + LeafConstraints(unsigned int num_features) { + min_constraints.resize(num_features); + max_constraints.resize(num_features); + + min_thresholds.resize(num_features); + max_thresholds.resize(num_features); + + min_to_be_updated.resize(num_features, false); + max_to_be_updated.resize(num_features, false); + are_actual_constraints_worse.resize(num_features, false); + + for (unsigned int i = 0; i < num_features; i++) { + // The number 32 has no real meaning here, but during our experiments, + // we found that the number of constraints per feature was well below 32, so by + // allocating this space, we may save some time because
+
+  // when the monotone precise mode is enabled, then for each feature,
+  // we need to store an array of constraints
+  LeafConstraints(unsigned int num_features) {
+    min_constraints.resize(num_features);
+    max_constraints.resize(num_features);
+
+    min_thresholds.resize(num_features);
+    max_thresholds.resize(num_features);
+
+    min_to_be_updated.resize(num_features, false);
+    max_to_be_updated.resize(num_features, false);
+    are_actual_constraints_worse.resize(num_features, false);
+
+    for (unsigned int i = 0; i < num_features; i++) {
+      // The number 32 has no real meaning here, but during our experiments,
+      // we found that the number of constraints per feature was well below
+      // 32, so by allocating this space, we may save some time because we
+      // won't have to allocate it later
+      min_constraints[i].reserve(32);
+      max_constraints[i].reserve(32);
+
+      min_thresholds[i].reserve(32);
+      max_thresholds[i].reserve(32);
+
+      min_constraints[i].push_back(-std::numeric_limits<double>::max());
+      max_constraints[i].push_back(std::numeric_limits<double>::max());
+
+      min_thresholds[i].push_back(0);
+      max_thresholds[i].push_back(0);
+    }
+  }
+
+  bool AreActualConstraintsWorse(unsigned int feature_idx) const {
+    return are_actual_constraints_worse[feature_idx];
+  }
+
+  bool ToBeUpdated(unsigned int feature_idx) const {
+    return min_to_be_updated[feature_idx] || max_to_be_updated[feature_idx];
+  }
+
+  bool MinToBeUpdated(unsigned int feature_idx) const {
+    return min_to_be_updated[feature_idx];
+  }
+
+  bool MaxToBeUpdated(unsigned int feature_idx) const {
+    return max_to_be_updated[feature_idx];
+  }
+
+  LeafConstraints(const LeafConstraints &constraints)
+      : min_constraints(constraints.min_constraints),
+        max_constraints(constraints.max_constraints),
+        min_thresholds(constraints.min_thresholds),
+        max_thresholds(constraints.max_thresholds),
+        min_to_be_updated(constraints.min_to_be_updated),
+        max_to_be_updated(constraints.max_to_be_updated),
+        are_actual_constraints_worse(constraints.are_actual_constraints_worse) {
+  }
+
+  // When we reset the constraints, we just need to write that the constraints
+  // are +/- inf, starting from the threshold 0
+  void Reset() {
+    for (unsigned int i = 0; i < min_constraints.size(); i++) {
+      min_constraints[i].resize(1);
+      max_constraints[i].resize(1);
+      min_thresholds[i].resize(1);
+      max_thresholds[i].resize(1);
+
+      min_constraints[i][0] = -std::numeric_limits<double>::max();
+      max_constraints[i][0] = std::numeric_limits<double>::max();
+      min_thresholds[i][0] = 0;
+      max_thresholds[i][0] = 0;
+    }
+  }
+
+  static double ComputeMonotoneSplitGainPenalty(int depth, double penalization,
+                                                double epsilon = 1e-10) {
+    if (penalization >= depth + 1.) {
+      return epsilon;
+    }
+    if (penalization <= 1.) {
+      return 1. - penalization / pow(2., depth) + epsilon;
+    }
+    return 1. - pow(2., penalization - 1. - depth) + epsilon;
+  }
+};
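The penalty above shrinks the gain of monotone splits near the root. A minimal standalone Python mirror of `ComputeMonotoneSplitGainPenalty` (a sketch, not the library API) makes the behaviour concrete:

```python
def monotone_split_gain_penalty(depth, penalization, epsilon=1e-10):
    # Mirror of LeafConstraints::ComputeMonotoneSplitGainPenalty:
    # returns the multiplier applied to the gain of a monotone split
    # found at `depth` (the root split is at depth 0).
    if penalization >= depth + 1.0:
        return epsilon  # monotone splits effectively forbidden at this depth
    if penalization <= 1.0:
        return 1.0 - penalization / 2.0 ** depth + epsilon
    return 1.0 - 2.0 ** (penalization - 1.0 - depth) + epsilon

# A penalization of 2 forbids monotone splits at depths 0 and 1,
# halves the gain at depth 2, and fades out deeper in the tree.
for depth in range(4):
    print(depth, monotone_split_gain_penalty(depth, 2.0))
# 0 1e-10
# 1 1e-10
# 2 0.5000000001
# 3 0.7500000001
```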
+
+struct SplittingConstraint {
+  std::vector<double> cumulative_min_constraint;
+  std::vector<double> cumulative_max_constraint;
+
+  unsigned int index_min_constraint;
+  unsigned int index_max_constraint;
+
+  SplittingConstraint() {
+    index_min_constraint = 0;
+    index_max_constraint = 0;
+
+    cumulative_min_constraint =
+        std::vector<double>(1, -std::numeric_limits<double>::max());
+    cumulative_max_constraint =
+        std::vector<double>(1, std::numeric_limits<double>::max());
+  }
+
+  SplittingConstraint(std::vector<double> cumulative_min_constraint,
+                      std::vector<double> cumulative_max_constraint) {
+    this->cumulative_min_constraint = cumulative_min_constraint;
+    this->cumulative_max_constraint = cumulative_max_constraint;
+  }
+
+  double GetCurrentMinConstraint() const {
+    return cumulative_min_constraint[index_min_constraint];
+  }
+
+  double GetCurrentMaxConstraint() const {
+    return cumulative_max_constraint[index_max_constraint];
+  }
+
+  void Reserve(int space_to_reserve) {
+    cumulative_max_constraint.reserve(space_to_reserve);
+    cumulative_min_constraint.reserve(space_to_reserve);
+  }
+
+  void InitializeConstraints() {
+    cumulative_max_constraint.resize(1);
+    cumulative_min_constraint.resize(1);
+    cumulative_min_constraint[0] = -std::numeric_limits<double>::max();
+    cumulative_max_constraint[0] = std::numeric_limits<double>::max();
+  }
+
+  void InitializeIndices(int dir, int min_size, int max_size) {
+    if (dir == -1) {
+      index_min_constraint = min_size;
+      index_max_constraint = max_size;
+    } else {
+      index_min_constraint = 0;
+      index_max_constraint = 0;
+    }
+  }
+
+  void Set(const LeafConstraints &leaf_constraints) {
+    cumulative_min_constraint[0] = leaf_constraints.min_constraints[0][0];
+    cumulative_max_constraint[0] = leaf_constraints.max_constraints[0][0];
+  }
+};
+
+struct SplittingConstraints {
+  SplittingConstraint right;
+  SplittingConstraint left;
+
+  std::vector<double> cumulative_min_constraint_right_to_left;
+  std::vector<double> cumulative_max_constraint_right_to_left;
+  std::vector<double> cumulative_min_constraint_left_to_right;
+  std::vector<double> cumulative_max_constraint_left_to_right;
+
+  std::vector<uint32_t> thresholds_min_constraints;
+  std::vector<uint32_t> thresholds_max_constraints;
+
+  unsigned int index_min_constraint_left_to_right;
+  unsigned int index_min_constraint_right_to_left;
+  unsigned int index_max_constraint_left_to_right;
+  unsigned int index_max_constraint_right_to_left;
+  bool update_is_necessary;
+
+  SplittingConstraints() {}
+
+  SplittingConstraints(
+      std::vector<double> &cumulative_min_constraint_right_to_left,
+      std::vector<double> &cumulative_min_constraint_left_to_right,
+      std::vector<double> &cumulative_max_constraint_right_to_left,
+      std::vector<double> &cumulative_max_constraint_left_to_right,
+      std::vector<uint32_t> &thresholds_min_constraints,
+      std::vector<uint32_t> &thresholds_max_constraints) {
+    right = SplittingConstraint(cumulative_min_constraint_right_to_left,
+                                cumulative_max_constraint_right_to_left);
+    left = SplittingConstraint(cumulative_min_constraint_left_to_right,
+                               cumulative_max_constraint_left_to_right);
+
+    this->thresholds_min_constraints = thresholds_min_constraints;
+    this->thresholds_max_constraints = thresholds_max_constraints;
+  }
+
+  static void CumulativeExtremum(
+      const double &(*extremum_function)(const double &, const double &),
+      bool is_direction_from_left_to_right,
+      std::vector<double> &cumulative_extremum) {
+    if (cumulative_extremum.size() == 1) {
+      return;
+    }
+#ifdef DEBUG
+    CHECK(cumulative_extremum.size() != 0);
+#endif
+
+    std::size_t n_exts = cumulative_extremum.size();
+    int step = is_direction_from_left_to_right ? 1 : -1;
+    std::size_t start = is_direction_from_left_to_right ? 0 : n_exts - 1;
+    std::size_t end = is_direction_from_left_to_right ? n_exts - 1 : 0;
+
+    for (auto i = start; i != end; i = i + step) {
+      cumulative_extremum[i + step] = extremum_function(
+          cumulative_extremum[i + step], cumulative_extremum[i]);
+    }
+  }
+
+  void ComputeCumulativeExtremums() {
+    const double &(*min)(const double &, const double &) = std::min<double>;
+    const double &(*max)(const double &, const double &) = std::max<double>;
+
+    CumulativeExtremum(max, true, left.cumulative_min_constraint);
+    CumulativeExtremum(max, false, right.cumulative_min_constraint);
+    CumulativeExtremum(min, true, left.cumulative_max_constraint);
+    CumulativeExtremum(min, false, right.cumulative_max_constraint);
+  }
+
+  void InitializeIndices(int dir) {
+    right.InitializeIndices(dir, thresholds_min_constraints.size() - 1,
+                            thresholds_max_constraints.size() - 1);
+    left.InitializeIndices(dir, thresholds_min_constraints.size() - 1,
+                           thresholds_max_constraints.size() - 1);
+    if (dir == -1) {
+      update_is_necessary = !(thresholds_max_constraints.size() == 1 &&
+                              thresholds_min_constraints.size() == 1);
+    }
+  }
+
+  void UpdateIndices(int dir, const int8_t bias, int t) {
+    if (dir == -1) {
+      if (update_is_necessary) {
+        while (static_cast<int>(
+                   thresholds_min_constraints[left.index_min_constraint]) >
+               t + bias - 1) {
+          left.index_min_constraint -= 1;
+        }
+        while (static_cast<int>(
+                   thresholds_min_constraints[right.index_min_constraint]) >
+               t + bias) {
+          right.index_min_constraint -= 1;
+        }
+        while (static_cast<int>(
+                   thresholds_max_constraints[left.index_max_constraint]) >
+               t + bias - 1) {
+          left.index_max_constraint -= 1;
+        }
+        while (static_cast<int>(
+                   thresholds_max_constraints[right.index_max_constraint]) >
+               t + bias) {
+          right.index_max_constraint -= 1;
+        }
+      }
+#ifdef DEBUG
+      CHECK(left.index_min_constraint < thresholds_min_constraints.size());
+      CHECK(right.index_min_constraint < thresholds_min_constraints.size());
+      CHECK(left.index_max_constraint < thresholds_max_constraints.size());
+      CHECK(right.index_max_constraint < thresholds_max_constraints.size());
+#endif
+    } else {
+      // current split gain
+#ifdef DEBUG
+      CHECK(left.index_min_constraint < thresholds_min_constraints.size());
+      CHECK(right.index_min_constraint < thresholds_min_constraints.size());
+      CHECK(left.index_max_constraint < thresholds_max_constraints.size());
+      CHECK(right.index_max_constraint < thresholds_max_constraints.size());
+#endif
+    }
+  }
+
+  void Reserve(int space_to_reserve) {
+    right.Reserve(space_to_reserve);
+    left.Reserve(space_to_reserve);
+    thresholds_max_constraints.reserve(space_to_reserve);
+    thresholds_min_constraints.reserve(space_to_reserve);
+  }
+
+  void InitializeConstraints() {
+    thresholds_min_constraints.resize(1);
+    thresholds_max_constraints.resize(1);
+
+    thresholds_min_constraints[0] = 0;
+    thresholds_max_constraints[0] = 0;
+
+    right.InitializeConstraints();
+    left.InitializeConstraints();
+  }
+
+  void Set(const LeafConstraints &leaf_constraints) {
+    right.Set(leaf_constraints);
+    left.Set(leaf_constraints);
+    thresholds_min_constraints[0] = leaf_constraints.min_thresholds[0][0];
+    thresholds_max_constraints[0] = leaf_constraints.max_thresholds[0][0];
+  }
+
+  void CheckCoherenceWithLeafOutput(double leaf_output, double EPS) {
+    CHECK(left.cumulative_min_constraint == right.cumulative_min_constraint);
+    CHECK(left.cumulative_max_constraint == right.cumulative_max_constraint);
+    for (const auto &x : left.cumulative_max_constraint) {
+      CHECK(leaf_output <= EPS + x);
+      CHECK(x > -std::numeric_limits<double>::max());
+    }
+    for (const auto &x : right.cumulative_min_constraint) {
+      CHECK(leaf_output + EPS >= x);
+      CHECK(x < std::numeric_limits<double>::max());
+    }
+  }
+};
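Since precise-mode constraints are stored per threshold, the struct above folds them into cumulative extrema before a split search: scanning thresholds left to right, the binding min constraint for the left child at threshold t is the max of all min constraints up to t, and a symmetric right-to-left pass serves the right child. A small Python sketch of `CumulativeExtremum` (illustrative, not the LightGBM API):

```python
def cumulative_extremum(values, extremum, left_to_right=True):
    # Mirror of SplittingConstraints::CumulativeExtremum (sketch only):
    # accumulate a running extremum over per-threshold constraint values.
    out = list(values)
    if left_to_right:
        for i in range(1, len(out)):
            out[i] = extremum(out[i], out[i - 1])
    else:
        for i in range(len(out) - 2, -1, -1):
            out[i] = extremum(out[i], out[i + 1])
    return out

mins = [0.2, -1.0, 0.5]  # per-threshold min constraints
print(cumulative_extremum(mins, max, left_to_right=True))   # [0.2, 0.2, 0.5]
print(cumulative_extremum(mins, max, left_to_right=False))  # [0.5, 0.5, 0.5]
```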
+
+struct CurrentConstraints {
+  std::vector<SplittingConstraints> splitting_constraints_vector;
+
+  const int space_to_reserve_non_monotone_precise_mode;
+  const int space_to_reserve_monotone_precise_mode;
+
+  // the number 32 has no real meaning here, but during our experiments,
+  // we found that the number of constraints per feature was well below 32, so
+  // by allocating this space, we may save some time because we won't have to
+  // allocate it later
+  CurrentConstraints()
+      : space_to_reserve_non_monotone_precise_mode(1),
+        space_to_reserve_monotone_precise_mode(32) {}
+
+  void Init(int num_threads_, const Config *config_) {
+    splitting_constraints_vector.resize(num_threads_);
+
+    int space_to_reserve = space_to_reserve_monotone_precise_mode;
+    if (!config_->monotone_precise_mode) {
+      space_to_reserve = space_to_reserve_non_monotone_precise_mode;
+    }
+
+    for (int i = 0; i < num_threads_; ++i) {
+      splitting_constraints_vector[i].Reserve(space_to_reserve);
+      InitializeConstraints(i);
+    }
+  }
+
+  SplittingConstraints &operator[](unsigned int i) {
+    return splitting_constraints_vector[i];
+  }
+
+  // initializing constraints is just writing that the constraints should be
+  // +/- inf from threshold 0
+  void InitializeConstraints(unsigned int tid) {
+    splitting_constraints_vector[tid].InitializeConstraints();
+  }
+
+  void Set(const LeafConstraints &leaf_constraints, unsigned int tid) {
+    splitting_constraints_vector[tid].Set(leaf_constraints);
+  }
+
+  void CheckCoherenceWithLeafOutput(double leaf_output, unsigned int tid,
+                                    double EPS) {
+    splitting_constraints_vector[tid]
+        .CheckCoherenceWithLeafOutput(leaf_output, EPS);
+  }
+};
+
+struct BestConstraint {
+  double best_min_constraint;
+  double best_max_constraint;
+
+  BestConstraint() {
+    best_min_constraint = -std::numeric_limits<double>::max();
+    best_max_constraint = std::numeric_limits<double>::max();
+  }
+
+  void Update(const SplittingConstraint &constraint) {
+    best_min_constraint = constraint.GetCurrentMinConstraint();
+    best_max_constraint = constraint.GetCurrentMaxConstraint();
+  }
+
+  // named to match SplittingConstraint so the two can be used in the same
+  // templated code
+  double GetCurrentMinConstraint() const {
+    return best_min_constraint;
+  }
+
+  double GetCurrentMaxConstraint() const {
+    return best_max_constraint;
+  }
+};
+
+struct BestConstraints {
+  BestConstraint right;
+  BestConstraint left;
+
+  void Update(SplittingConstraints *constraints) {
+    right.Update(constraints->right);
+    left.Update(constraints->left);
+  }
+};
+
+}  // namespace LightGBM
+#endif  // LightGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_
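To make the two storage schemes in `LeafConstraints` concrete: in fast mode a leaf carries a single interval, while in precise mode each feature carries parallel arrays of constraint values and the bin thresholds from which they apply. A hypothetical Python sketch of the data layout (not the library API):

```python
INF = float("inf")  # stand-in for std::numeric_limits<double>::max()

# Fast mode (monotone_precise_mode=false): one global interval per leaf,
# mirroring the default LeafConstraints() constructor.
fast_leaf = {"min": [[-INF]], "max": [[INF]],
             "min_thresholds": [[0]], "max_thresholds": [[0]]}

# Precise mode (monotone_precise_mode=true): one array per feature,
# mirroring LeafConstraints(unsigned int num_features); entry j of
# "min"[f] applies from bin threshold "min_thresholds"[f][j] onwards.
num_features = 3
precise_leaf = {
    "min": [[-INF] for _ in range(num_features)],
    "max": [[INF] for _ in range(num_features)],
    "min_thresholds": [[0] for _ in range(num_features)],
    "max_thresholds": [[0] for _ in range(num_features)],
}

# Example: tighten the min constraint of feature 1 from bin 8 onwards.
precise_leaf["min"][1].append(0.25)
precise_leaf["min_thresholds"][1].append(8)
```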
diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h
index c6754b517397..92a67cdbe5ea 100644
--- a/src/treelearner/parallel_tree_learner.h
+++ b/src/treelearner/parallel_tree_learner.h
@@ -31,7 +31,9 @@ class FeatureParallelTreeLearner: public TREELEARNER_T {
 protected:
   void BeforeTrain() override;
-  void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
+  void FindBestSplitsFromHistograms(const std::vector<int8_t> &is_feature_used,
+                                    bool use_subtract,
+                                    const Tree *tree) override;
 
 private:
  /*! \brief rank of local machine */
@@ -59,8 +61,10 @@ class DataParallelTreeLearner: public TREELEARNER_T {
 protected:
   void BeforeTrain() override;
-  void FindBestSplits() override;
-  void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
+  void FindBestSplits(const Tree *tree) override;
+  void FindBestSplitsFromHistograms(const std::vector<int8_t> &is_feature_used,
+                                    bool use_subtract,
+                                    const Tree *tree) override;
   void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
 
   inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override {
@@ -114,8 +118,10 @@ class VotingParallelTreeLearner: public TREELEARNER_T {
 protected:
   void BeforeTrain() override;
   bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
-  void FindBestSplits() override;
-  void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
+  void FindBestSplits(const Tree *tree) override;
+  void FindBestSplitsFromHistograms(const std::vector<int8_t> &is_feature_used,
+                                    bool use_subtract,
+                                    const Tree *tree) override;
   void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
 
   inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override {
diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp
index e3531039ec9d..1f358ac8aa3e 100644
--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -25,6 +25,7 @@ std::chrono::duration<double, std::milli> hist_time;
 std::chrono::duration<double, std::milli> find_split_time;
 std::chrono::duration<double, std::milli> split_time;
 std::chrono::duration<double, std::milli> ordered_bin_time;
+std::chrono::duration<double, std::milli> refit_leaves_time;
 #endif  // TIMETAG
 
 SerialTreeLearner::SerialTreeLearner(const Config* config)
@@ -45,6 +46,7 @@ SerialTreeLearner::~SerialTreeLearner() {
   Log::Info("SerialTreeLearner::find_split costs %f", find_split_time * 1e-3);
   Log::Info("SerialTreeLearner::split costs %f", split_time * 1e-3);
   Log::Info("SerialTreeLearner::ordered_bin costs %f", ordered_bin_time * 1e-3);
+  Log::Info("SerialTreeLearner::refit_leaves costs %f", refit_leaves_time * 1e-3);
 #endif
 }
 
@@ -71,6 +73,16 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
   histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves);
   // push split information for all leaves
   best_split_per_leaf_.resize(config_->num_leaves);
+
+  // when the monotone precise mode is enabled, we need to store
+  // more constraints; hence the constructors are different
+  if (config_->monotone_precise_mode) {
+    constraints_per_leaf_.resize(config_->num_leaves,
+                                 LeafConstraints(num_features_));
+  } else {
+    constraints_per_leaf_.resize(config_->num_leaves,
+                                 LeafConstraints());
+  }
+
   // get ordered bin
   train_data_->CreateOrderedBins(&ordered_bins_);
 
@@ -108,6 +120,8 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
     cegb_.reset(new CostEfficientGradientBoosting(this));
     cegb_->Init();
   }
+
+  current_constraints.Init(num_threads_, config_);
 }
 
 void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
@@ -208,7 +222,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
     init_split_time += std::chrono::steady_clock::now() - start_time;
 #endif
     // find best threshold for every feature
-    FindBestSplits();
+    FindBestSplits(tree.get());
   } else if (aborted_last_force_split) {
     aborted_last_force_split = false;
   }
@@ -236,6 +250,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
   return tree.release();
 }
 
+
Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t *hessians) const {
  auto tree = std::unique_ptr<Tree>(new Tree(*old_tree));
  CHECK(data_partition_->num_leaves() >= tree->num_leaves());
@@ -337,6 +352,7 @@ void SerialTreeLearner::BeforeTrain() {
   // reset the splits for leaves
   for (int i = 0; i < config_->num_leaves; ++i) {
     best_split_per_leaf_[i].Reset();
+    constraints_per_leaf_[i].Reset();
   }
 
   // Sumup for root
@@ -346,7 +362,7 @@ void SerialTreeLearner::BeforeTrain() {
   } else {
     // use bagging, only use part of data
-    smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
+    smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_, 0.);
   }
 
   larger_leaf_splits_->Init();
@@ -476,7 +492,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int
   return true;
 }
 
-void SerialTreeLearner::FindBestSplits() {
+void SerialTreeLearner::FindBestSplits(const Tree* tree) {
   std::vector<int8_t> is_feature_used(num_features_, 0);
 #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048)
   for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
@@ -490,7 +506,7 @@ void SerialTreeLearner::FindBestSplits() {
   }
   bool use_subtract = parent_leaf_histogram_array_ != nullptr;
   ConstructHistograms(is_feature_used, use_subtract);
-  FindBestSplitsFromHistograms(is_feature_used, use_subtract);
+  FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree);
 }
 
 void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
@@ -521,10 +537,15 @@ void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_featur
 #endif
 }
 
-void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
+void SerialTreeLearner::FindBestSplitsFromHistograms(
+    const std::vector<int8_t> &is_feature_used, bool use_subtract,
+    const Tree *tree) {
 #ifdef TIMETAG
   auto start_time = std::chrono::steady_clock::now();
 #endif
+  LearnerState learner_state(config_, data_partition_.get(), train_data_,
+                             constraints_per_leaf_, tree, current_constraints,
+                             cegb_);
   std::vector<SplitInfo> smaller_best(num_threads_);
   std::vector<SplitInfo> larger_best(num_threads_);
   std::vector<int8_t> smaller_node_used_features(num_features_, 1);
@@ -540,26 +561,21 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
     OMP_LOOP_EX_BEGIN();
     if (!is_feature_used[feature_index]) { continue; }
     const int tid = omp_get_thread_num();
-    SplitInfo smaller_split;
+
     train_data_->FixHistogram(feature_index,
                               smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
                               smaller_leaf_splits_->num_data_in_leaf(),
                               smaller_leaf_histogram_array_[feature_index].RawData());
     int real_fidx = train_data_->RealFeatureIndex(feature_index);
-    smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
-      smaller_leaf_splits_->sum_gradients(),
-      smaller_leaf_splits_->sum_hessians(),
-      smaller_leaf_splits_->num_data_in_leaf(),
-      smaller_leaf_splits_->min_constraint(),
-      smaller_leaf_splits_->max_constraint(),
-      &smaller_split);
-    smaller_split.feature = real_fidx;
-    if (cegb_ != nullptr) {
-      smaller_split.gain -= cegb_->DetlaGain(feature_index, real_fidx, smaller_leaf_splits_->LeafIndex(), smaller_leaf_splits_->num_data_in_leaf(), smaller_split);
-    }
-    if (smaller_split > smaller_best[tid] && smaller_node_used_features[feature_index]) {
-      smaller_best[tid] = smaller_split;
-    }
+
+    ComputeBestSplitForFeature(smaller_leaf_splits_->sum_gradients(),
+                               smaller_leaf_splits_->sum_hessians(),
+                               smaller_leaf_splits_->num_data_in_leaf(),
+                               feature_index, smaller_leaf_histogram_array_,
+                               smaller_best, smaller_leaf_splits_->LeafIndex(),
+                               smaller_leaf_splits_->depth(), tid, real_fidx,
+                               learner_state);
+
     // only has root leaf
     if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
@@ -570,22 +586,15 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
                                 larger_leaf_splits_->num_data_in_leaf(),
                                 larger_leaf_histogram_array_[feature_index].RawData());
     }
-    SplitInfo larger_split;
-    // find best threshold for larger child
-    larger_leaf_histogram_array_[feature_index].FindBestThreshold(
-      larger_leaf_splits_->sum_gradients(),
-      larger_leaf_splits_->sum_hessians(),
-      larger_leaf_splits_->num_data_in_leaf(),
-      larger_leaf_splits_->min_constraint(),
-      larger_leaf_splits_->max_constraint(),
-      &larger_split);
-    larger_split.feature = real_fidx;
-    if (cegb_ != nullptr) {
-      larger_split.gain -= cegb_->DetlaGain(feature_index, real_fidx, larger_leaf_splits_->LeafIndex(), larger_leaf_splits_->num_data_in_leaf(), larger_split);
-    }
-    if (larger_split > larger_best[tid] && larger_node_used_features[feature_index]) {
-      larger_best[tid] = larger_split;
-    }
+
+    ComputeBestSplitForFeature(larger_leaf_splits_->sum_gradients(),
+                               larger_leaf_splits_->sum_hessians(),
+                               larger_leaf_splits_->num_data_in_leaf(),
+                               feature_index, larger_leaf_histogram_array_,
+                               larger_best, larger_leaf_splits_->LeafIndex(),
+                               larger_leaf_splits_->depth(), tid, real_fidx,
+                               learner_state);
+
     OMP_LOOP_EX_END();
   }
   OMP_THROW_EX();
@@ -620,7 +629,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
   // before processing next node from queue, store info for current left/right leaf
   // store "best split" for left and right, even if they might be overwritten by forced split
   if (BeforeFindBestSplit(tree, *left_leaf, *right_leaf)) {
-    FindBestSplits();
+    FindBestSplits(tree);
   }
   // then, compute own splits
   SplitInfo left_split;
@@ -679,6 +688,11 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
     SplitInfo current_split_info = forceSplitMap[current_leaf];
     const int inner_feature_index = train_data_->InnerFeatureIndex(
         current_split_info.feature);
+    // we want to know if the feature has to be monotone
+    bool feature_is_monotone = false;
+    if (!config_->monotone_constraints.empty()) {
+      feature_is_monotone = config_->monotone_constraints[inner_feature_index] != 0;
+    }
     auto threshold_double = train_data_->RealThreshold(
         inner_feature_index, current_split_info.threshold);
@@ -698,7 +712,8 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
                            static_cast<double>(current_split_info.right_sum_hessian),
                            static_cast<float>(current_split_info.gain),
                            train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
-                           current_split_info.default_left);
+                           current_split_info.default_left,
+                           feature_is_monotone);
       data_partition_->Split(current_leaf, train_data_, inner_feature_index,
                              &current_split_info.threshold, 1,
                              current_split_info.default_left, *right_leaf);
@@ -726,26 +741,33 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
                                       static_cast<double>(current_split_info.left_sum_hessian),
                                       static_cast<double>(current_split_info.right_sum_hessian),
                                       static_cast<float>(current_split_info.gain),
-                                      train_data_->FeatureBinMapper(inner_feature_index)->missing_type());
+                                      train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
+                                      feature_is_monotone);
       data_partition_->Split(current_leaf, train_data_, inner_feature_index,
                              cat_bitset_inner.data(), static_cast<int>(cat_bitset_inner.size()),
                              current_split_info.default_left, *right_leaf);
     }
+    int depth = tree->leaf_depth(*left_leaf);
+#ifdef DEBUG
+    CHECK(depth == tree->leaf_depth(*right_leaf));
+#endif
     if (current_split_info.left_count < current_split_info.right_count) {
       left_smaller = true;
       smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(),
                                  current_split_info.left_sum_gradient,
-                                 current_split_info.left_sum_hessian);
+                                 current_split_info.left_sum_hessian, depth);
       larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
                                 current_split_info.right_sum_gradient,
-                                current_split_info.right_sum_hessian);
+                                current_split_info.right_sum_hessian, depth);
     } else {
       left_smaller = false;
       smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(),
-                                 current_split_info.right_sum_gradient, current_split_info.right_sum_hessian);
+                                 current_split_info.right_sum_gradient,
+                                 current_split_info.right_sum_hessian, depth);
       larger_leaf_splits_->Init(*left_leaf, data_partition_.get(),
-                                current_split_info.left_sum_gradient, current_split_info.left_sum_hessian);
+                                current_split_info.left_sum_gradient,
+                                current_split_info.left_sum_hessian, depth);
     }
 
     left = Json();
@@ -770,6 +792,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
 void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
   const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
+  double previous_leaf_output = tree->LeafOutput(best_leaf);
   const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
   if (cegb_ != nullptr) {
     cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info, &best_split_per_leaf_);
@@ -793,7 +816,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri
                            static_cast<double>(best_split_info.right_sum_hessian),
                            static_cast<float>(best_split_info.gain),
                            train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
-                           best_split_info.default_left);
+                           best_split_info.default_left,
+                           best_split_info.monotone_type != 0);
     data_partition_->Split(best_leaf, train_data_, inner_feature_index,
                            &best_split_info.threshold, 1, best_split_info.default_left, *right_leaf);
   } else {
@@ -817,7 +841,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri
                                       static_cast<double>(best_split_info.left_sum_hessian),
                                       static_cast<double>(best_split_info.right_sum_hessian),
                                       static_cast<float>(best_split_info.gain),
-                                      train_data_->FeatureBinMapper(inner_feature_index)->missing_type());
+                                      train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
+                                      best_split_info.monotone_type != 0);
     data_partition_->Split(best_leaf, train_data_, inner_feature_index,
                            cat_bitset_inner.data(), static_cast<int>(cat_bitset_inner.size()), best_split_info.default_left, *right_leaf);
   }
@@ -825,33 +850,52 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri
 #ifdef DEBUG
   CHECK(best_split_info.left_count == data_partition_->leaf_count(best_leaf));
 #endif
-  auto p_left = smaller_leaf_splits_.get();
-  auto p_right = larger_leaf_splits_.get();
   // init the leaves that will be used in the next iteration
+  int depth = tree->leaf_depth(*left_leaf);
+#ifdef DEBUG
+  CHECK(depth == tree->leaf_depth(*right_leaf));
+#endif
   if (best_split_info.left_count < best_split_info.right_count) {
-    smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
-    larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
+    smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(),
+                               best_split_info.left_sum_gradient,
+                               best_split_info.left_sum_hessian, depth);
+    larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
+                              best_split_info.right_sum_gradient,
+                              best_split_info.right_sum_hessian, depth);
   } else {
-    smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
-    larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
-    p_right = smaller_leaf_splits_.get();
-    p_left = larger_leaf_splits_.get();
-  }
-  p_left->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint);
-  p_right->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint);
-  if (is_numerical_split) {
-    double mid = (best_split_info.left_output + best_split_info.right_output) / 2.0f;
-    if (best_split_info.monotone_type < 0) {
-      p_left->SetValueConstraint(mid, best_split_info.max_constraint);
-      p_right->SetValueConstraint(best_split_info.min_constraint, mid);
-    } else if (best_split_info.monotone_type > 0) {
-      p_left->SetValueConstraint(best_split_info.min_constraint, mid);
-      p_right->SetValueConstraint(mid, best_split_info.max_constraint);
-    }
+    smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(),
+                               best_split_info.right_sum_gradient,
+                               best_split_info.right_sum_hessian, depth);
+    larger_leaf_splits_->Init(*left_leaf, data_partition_.get(),
+                              best_split_info.left_sum_gradient,
+                              best_split_info.left_sum_hessian, depth);
+  }
+
+  // when the monotone precise mode is disabled, it is very easy to compute
+  // the constraints of the children of a leaf, but when it is enabled, one
+  // needs to go through the tree to do so, and it is done directly before
+  // computing best splits
+  if (!config_->monotone_precise_mode) {
+    LeafConstraints::SetChildrenConstraintsFastMethod(
+        constraints_per_leaf_, right_leaf, left_leaf,
+        best_split_info.monotone_type, best_split_info.right_output,
+        best_split_info.left_output, is_numerical_split);
+  }
+
+  // if there is a monotone split above, we need to make sure the new
+  // values don't clash with existing constraints in the subtree,
+  // and if they do, the existing splits need to be updated
+  if (!config_->monotone_constraints.empty() && tree->leaf_is_in_monotone_subtree(*right_leaf)) {
+    LearnerState learner_state(config_, data_partition_.get(), train_data_,
+                               constraints_per_leaf_, tree, current_constraints,
+                               cegb_);
+    LeafConstraints::GoUpToFindLeavesToUpdate(
+        tree->leaf_parent(*right_leaf), inner_feature_index,
+        best_split_info, previous_leaf_output, best_split_info.threshold,
+        best_split_per_leaf_, is_feature_used_,
+        num_threads_, num_features_, histogram_pool_, learner_state);
+  }
 }
 
-
 void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
                                         data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const {
   if (obj != nullptr && obj->IsRenewTreeOutput()) {
@@ -892,4 +936,56 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj
   }
 }
 
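The "fast" child-constraint update that this hunk moves out of `Split()` and into `LeafConstraints::SetChildrenConstraintsFastMethod` follows the mid-point rule visible in the removed lines. A Python sketch under illustrative names:

```python
def set_children_constraints_fast(parent_min, parent_max, monotone_type,
                                  left_output, right_output,
                                  is_numerical_split):
    # Both children start from the parent's interval.
    left = {"min": parent_min, "max": parent_max}
    right = {"min": parent_min, "max": parent_max}
    if is_numerical_split and monotone_type != 0:
        mid = (left_output + right_output) / 2.0
        if monotone_type < 0:   # decreasing: left child must stay >= mid
            left["min"], right["max"] = mid, mid
        else:                   # increasing: left child must stay <= mid
            left["max"], right["min"] = mid, mid
    return left, right

# An increasing constraint caps the left child at the mid-point 0.2:
print(set_children_constraints_fast(float("-inf"), float("inf"),
                                    1, -0.3, 0.7, True))
# ({'min': -inf, 'max': 0.2}, {'min': 0.2, 'max': inf})
```

Splitting at the mid-point is cheap but can over-constrain both children, which is exactly what the precise mode trades speed to avoid.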
+// this function computes the best split for a given leaf and a given feature
+void SerialTreeLearner::ComputeBestSplitForFeature(
+    double sum_gradient, double sum_hessian, data_size_t num_data,
+    int feature_index, FeatureHistogram *histogram_array_,
+    std::vector<SplitInfo> &bests, int leaf_index, int depth, const int tid,
+    int real_fidx, LearnerState &learner_state, bool update) {
+
+  // if this is not a subtree stemming from a monotone split, then no
+  // constraint applies
+  if (learner_state.tree->leaf_is_in_monotone_subtree(leaf_index)) {
+    if (!learner_state.config_->monotone_precise_mode) {
+      learner_state.current_constraints.Set(
+          learner_state.constraints_per_leaf_[leaf_index], tid);
+    }
+  }
+
+#ifdef DEBUG
+  learner_state.current_constraints.CheckCoherenceWithLeafOutput(
+      learner_state.tree->LeafOutput(leaf_index), tid, kEpsilon);
+#endif
+
+  SplitInfo new_split;
+
+  SplittingConstraints *constraints;
+  if (learner_state.config_->monotone_constraints.empty()) {
+    constraints = nullptr;
+  } else {
+    constraints = &learner_state.current_constraints[tid];
+  }
+  histogram_array_[feature_index].FindBestThreshold(
+      sum_gradient, sum_hessian, num_data, &new_split, constraints);
+
+  if (learner_state.tree->leaf_is_in_monotone_subtree(leaf_index)) {
+    learner_state.current_constraints.InitializeConstraints(tid);
+  }
+
+  new_split.feature = real_fidx;
+  if (learner_state.cegb_ != nullptr) {
+    new_split.gain -= learner_state.cegb_->DetlaGain(
+        feature_index, real_fidx, leaf_index, num_data, new_split);
+  }
+
+  if (new_split.monotone_type != 0) {
+    double penalty = LeafConstraints::ComputeMonotoneSplitGainPenalty(
+        depth, learner_state.config_->monotone_penalty);
+    new_split.gain *= penalty;
+  }
+
+  if (new_split > bests[tid]) {
+    bests[tid] = new_split;
+  }
+}
+
 }  // namespace LightGBM
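The `GoUpToFindLeavesToUpdate` call a few hunks above walks the tree whenever a fresh monotone split might clash with outputs elsewhere in the subtree. Very roughly (the real traversal prunes with `ShouldKeepGoingLeftRight` and the recorded thresholds, which this sketch omits), the idea is: climb from the new split's parent to the root, and from each ancestor descend into the opposite branch to collect leaves whose constraints may need updating. A hypothetical sketch with an assumed tree interface:

```python
def leaves_to_update(tree, node):
    # Climb towards the root; at each ancestor, the sibling subtree is the
    # region whose leaves may be affected by the new monotone split.
    leaves = []
    while True:
        parent = tree.parent(node)
        if parent is None:
            return leaves
        sibling = (tree.left(parent) if tree.right(parent) == node
                   else tree.right(parent))
        leaves.extend(collect_leaves(tree, sibling))
        node = parent

def collect_leaves(tree, node):
    # The real GoDownToFindLeavesToUpdate prunes branches that the recorded
    # split thresholds prove cannot contain affected leaves; this sketch
    # simply enumerates them all.
    if tree.is_leaf(node):
        return [node]
    return (collect_leaves(tree, tree.left(node)) +
            collect_leaves(tree, tree.right(node)))
```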
diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h
index 31743933a780..1026f0172060 100644
--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -22,6 +22,7 @@
 #include "feature_histogram.hpp"
 #include "leaf_splits.hpp"
 #include "split_info.hpp"
+#include "monotone_constraints.hpp"
 
 #ifdef USE_GPU
 // Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled.
@@ -79,6 +80,12 @@ class SerialTreeLearner: public TreeLearner {
   void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
                        data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
 
+  static void ComputeBestSplitForFeature(
+      double sum_gradient, double sum_hessian, data_size_t num_data,
+      int feature_index, FeatureHistogram *histogram_array_,
+      std::vector<SplitInfo> &bests, int leaf_index, int depth, const int tid,
+      int real_fidx, LearnerState &learner_state, bool update = false);
+
 protected:
   virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
  /*!
@@ -91,11 +98,13 @@ class SerialTreeLearner: public TreeLearner {
  */
   virtual bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf);
 
-  virtual void FindBestSplits();
+  virtual void FindBestSplits(const Tree* tree);
 
   virtual void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract);
 
-  virtual void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract);
+  virtual void
+  FindBestSplitsFromHistograms(const std::vector<int8_t> &is_feature_used,
+                               bool use_subtract, const Tree *tree);
 
  /*!
  * \brief Partition tree and data according best split.
@@ -145,8 +154,8 @@ class SerialTreeLearner: public TreeLearner {
  /*! \brief store best split points for all leaves */
   std::vector<SplitInfo> best_split_per_leaf_;
-  /*! \brief store best split per feature for all leaves */
-  std::vector<SplitInfo> splits_per_leaf_;
+
+  std::vector<LeafConstraints> constraints_per_leaf_;
 
  /*! \brief stores best thresholds for all feature for smaller leaf */
   std::unique_ptr<LeafSplits> smaller_leaf_splits_;
@@ -180,6 +189,8 @@ class SerialTreeLearner: public TreeLearner {
   std::vector<int> ordered_bin_indices_;
   bool is_constant_hessian_;
   std::unique_ptr<CostEfficientGradientBoosting> cegb_;
+
+  CurrentConstraints current_constraints;
 };
 
 inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const {
diff --git a/src/treelearner/split_info.hpp b/src/treelearner/split_info.hpp
index 3afa72a0f4a3..86653522dd04 100644
--- a/src/treelearner/split_info.hpp
+++ b/src/treelearner/split_info.hpp
@@ -48,8 +48,6 @@ struct SplitInfo {
  /*! \brief True if default split is left */
   bool default_left = true;
   int8_t monotone_type = 0;
-  double min_constraint = -std::numeric_limits<double>::max();
-  double max_constraint = std::numeric_limits<double>::max();
   inline static int Size(int max_cat_threshold) {
     return 2 * sizeof(int) + sizeof(uint32_t) + sizeof(bool) + sizeof(double) * 9 + sizeof(data_size_t) * 2 + max_cat_threshold * sizeof(uint32_t) + sizeof(int8_t);
   }
@@ -81,10 +79,6 @@ struct SplitInfo {
     buffer += sizeof(default_left);
     std::memcpy(buffer, &monotone_type, sizeof(monotone_type));
     buffer += sizeof(monotone_type);
-    std::memcpy(buffer, &min_constraint, sizeof(min_constraint));
-    buffer += sizeof(min_constraint);
-    std::memcpy(buffer, &max_constraint, sizeof(max_constraint));
-    buffer += sizeof(max_constraint);
     std::memcpy(buffer, &num_cat_threshold, sizeof(num_cat_threshold));
     buffer += sizeof(num_cat_threshold);
     std::memcpy(buffer, cat_threshold.data(), sizeof(uint32_t) * num_cat_threshold);
@@ -117,10 +111,6 @@ struct SplitInfo {
     buffer += sizeof(default_left);
     std::memcpy(&monotone_type, buffer, sizeof(monotone_type));
     buffer += sizeof(monotone_type);
-    std::memcpy(&min_constraint, buffer, sizeof(min_constraint));
-    buffer += sizeof(min_constraint);
-    std::memcpy(&max_constraint, buffer, sizeof(max_constraint));
-    buffer += sizeof(max_constraint);
     std::memcpy(&num_cat_threshold, buffer, sizeof(num_cat_threshold));
     buffer += sizeof(num_cat_threshold);
     cat_threshold.resize(num_cat_threshold);
diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp
index cb18e3779ba6..265f2546e726 100644
--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -149,16 +149,26 @@ bool VotingParallelTreeLearner<TREELEARNER_T>::BeforeFindBestSplit(const Tree* t
   if (TREELEARNER_T::BeforeFindBestSplit(tree, left_leaf, right_leaf)) {
     data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
     data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
+    int depth = tree->leaf_depth(left_leaf);
+#ifdef DEBUG
+    CHECK(depth == tree->leaf_depth(right_leaf));
+#endif
     if (right_leaf < 0) {
       return true;
     } else if (num_data_in_left_child < num_data_in_right_child) {
       // get local sumup
-      this->smaller_leaf_splits_->Init(left_leaf, this->data_partition_.get(), this->gradients_, this->hessians_);
-      this->larger_leaf_splits_->Init(right_leaf, this->data_partition_.get(), this->gradients_, this->hessians_);
+      this->smaller_leaf_splits_->Init(left_leaf, this->data_partition_.get(),
+                                       this->gradients_, this->hessians_,
+                                       depth);
+      this->larger_leaf_splits_->Init(right_leaf, this->data_partition_.get(),
+                                      this->gradients_, this->hessians_, depth);
     } else {
       // get local sumup
-      this->smaller_leaf_splits_->Init(right_leaf, this->data_partition_.get(), this->gradients_, this->hessians_);
-      this->larger_leaf_splits_->Init(left_leaf, this->data_partition_.get(), this->gradients_, this->hessians_);
+      this->smaller_leaf_splits_->Init(right_leaf, this->data_partition_.get(),
+                                       this->gradients_, this->hessians_,
+                                       depth);
+      this->larger_leaf_splits_->Init(left_leaf, this->data_partition_.get(),
+                                      this->gradients_, this->hessians_, depth);
     }
     return true;
   } else {
@@ -259,7 +269,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::CopyLocalHistogram(const std::vec
 }
 
 template <typename TREELEARNER_T>
-void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
+void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) {
   // use local data to find local best splits
   std::vector<int8_t> is_feature_used(this->num_features_, 0);
 #pragma omp parallel for schedule(static)
@@ -293,13 +303,14 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
       this->smaller_leaf_splits_->num_data_in_leaf(),
       this->smaller_leaf_histogram_array_[feature_index].RawData());
 
-    this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
-      this->smaller_leaf_splits_->sum_gradients(),
-      this->smaller_leaf_splits_->sum_hessians(),
-      this->smaller_leaf_splits_->num_data_in_leaf(),
-      this->smaller_leaf_splits_->min_constraint(),
-      this->smaller_leaf_splits_->max_constraint(),
-      &smaller_bestsplit_per_features[feature_index]);
+    // FIXME Fill the vectors with the actual constraints and thresholds
+    SplittingConstraints *constraints = nullptr;
+    this->smaller_leaf_histogram_array_[feature_index]
+        .FindBestThreshold(this->smaller_leaf_splits_->sum_gradients(),
+                           this->smaller_leaf_splits_->sum_hessians(),
+                           this->smaller_leaf_splits_->num_data_in_leaf(),
+                           &smaller_bestsplit_per_features[feature_index],
+                           constraints);
     smaller_bestsplit_per_features[feature_index].feature = real_feature_index;
     // only has root leaf
     if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->LeafIndex() < 0) { continue; }
@@ -312,13 +323,13 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
       this->larger_leaf_histogram_array_[feature_index].RawData());
     }
     // find best threshold for larger child
-    this->larger_leaf_histogram_array_[feature_index].FindBestThreshold(
-      this->larger_leaf_splits_->sum_gradients(),
-      this->larger_leaf_splits_->sum_hessians(),
-      this->larger_leaf_splits_->num_data_in_leaf(),
-      this->larger_leaf_splits_->min_constraint(),
-      this->larger_leaf_splits_->max_constraint(),
-      &larger_bestsplit_per_features[feature_index]);
+    // FIXME Fill the vectors with the actual constraints and thresholds
+    this->larger_leaf_histogram_array_[feature_index]
+        .FindBestThreshold(this->larger_leaf_splits_->sum_gradients(),
+                           this->larger_leaf_splits_->sum_hessians(),
+                           this->larger_leaf_splits_->num_data_in_leaf(),
+                           &larger_bestsplit_per_features[feature_index],
+                           constraints);
     larger_bestsplit_per_features[feature_index].feature = real_feature_index;
     OMP_LOOP_EX_END();
   }
@@ -370,11 +381,12 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
   Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(),
                          block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramBinEntry::SumReducer);
-  this->FindBestSplitsFromHistograms(is_feature_used, false);
+  this->FindBestSplitsFromHistograms(is_feature_used, false, tree);
 }
 
 template <typename TREELEARNER_T>
-void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
+void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(
+    const std::vector<int8_t> &, bool, const Tree *tree) {
   std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_);
   std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
@@ -403,13 +415,13 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(cons
       smaller_leaf_histogram_array_global_[feature_index].RawData());
 
     // find best threshold
+    // FIXME Fill the vectors with the actual constraints and thresholds
+    SplittingConstraints *constraints = nullptr;
     smaller_leaf_histogram_array_global_[feature_index].FindBestThreshold(
-      smaller_leaf_splits_global_->sum_gradients(),
-      smaller_leaf_splits_global_->sum_hessians(),
-      GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()),
-      smaller_leaf_splits_global_->min_constraint(),
-      smaller_leaf_splits_global_->max_constraint(),
-      &smaller_split);
+        smaller_leaf_splits_global_->sum_gradients(),
+        smaller_leaf_splits_global_->sum_hessians(),
+        GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()),
+        &smaller_split, constraints);
     smaller_split.feature = real_feature_index;
     if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) {
       smaller_bests_per_thread[tid] = smaller_split;
@@ -427,13 +439,14 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(cons
       larger_leaf_histogram_array_global_[feature_index].RawData());
 
     // find best threshold
+    // FIXME Fill the vectors with the actual constraints and thresholds
+    SplittingConstraints *constraints = nullptr;
+
     larger_leaf_histogram_array_global_[feature_index].FindBestThreshold(
-      larger_leaf_splits_global_->sum_gradients(),
-      larger_leaf_splits_global_->sum_hessians(),
-      GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
-      larger_leaf_splits_global_->min_constraint(),
-      larger_leaf_splits_global_->max_constraint(),
-      &larger_split);
+        larger_leaf_splits_global_->sum_gradients(),
+        larger_leaf_splits_global_->sum_hessians(),
+        GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
+        &larger_split, constraints);
     larger_split.feature = real_feature_index;
     if (larger_split > larger_best_per_thread[tid] && larger_node_used_features[feature_index]) {
       larger_best_per_thread[tid] = larger_split;
@@ -477,39 +490,25 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf,
   // set the global number of data for leaves
   global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count;
   global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count;
-  auto p_left = smaller_leaf_splits_global_.get();
-  auto p_right = larger_leaf_splits_global_.get();
   // init the global sumup info
+  int depth = tree->leaf_depth(*left_leaf);
+#ifdef DEBUG
+  CHECK(depth == tree->leaf_depth(*right_leaf));
+#endif
   if (best_split_info.left_count < best_split_info.right_count) {
     smaller_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(),
-                                      best_split_info.left_sum_gradient,
-                                      best_split_info.left_sum_hessian);
+                                      best_split_info.left_sum_gradient,
+                                      best_split_info.left_sum_hessian, depth);
     larger_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(),
-                                     best_split_info.right_sum_gradient,
-                                     best_split_info.right_sum_hessian);
+                                     best_split_info.right_sum_gradient,
+                                     best_split_info.right_sum_hessian, depth);
   } else {
     smaller_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(),
-                                      best_split_info.right_sum_gradient,
-                                      best_split_info.right_sum_hessian);
+                                      best_split_info.right_sum_gradient,
+                                      best_split_info.right_sum_hessian, depth);
     larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(),
-                                     best_split_info.left_sum_gradient,
-                                     best_split_info.left_sum_hessian);
-    p_left = larger_leaf_splits_global_.get();
-    p_right = smaller_leaf_splits_global_.get();
-  }
-  const int inner_feature_index = this->train_data_->InnerFeatureIndex(best_split_info.feature);
-  bool is_numerical_split = this->train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin;
-  p_left->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint);
-  p_right->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint);
-  if (is_numerical_split) {
-    double mid = (best_split_info.left_output + best_split_info.right_output) / 2.0f;
-    if (best_split_info.monotone_type < 0) {
-      p_left->SetValueConstraint(mid, best_split_info.max_constraint);
-      p_right->SetValueConstraint(best_split_info.min_constraint, mid);
-    } else if (best_split_info.monotone_type > 0) {
-      p_left->SetValueConstraint(best_split_info.min_constraint, mid);
-      p_right->SetValueConstraint(mid, best_split_info.max_constraint);
-    }
+                                     best_split_info.left_sum_gradient,
+                                     best_split_info.left_sum_hessian, depth);
   }
 }
 
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 51c99494e68e..e9e47145bc6a 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -856,45 +856,200 @@ def test_init_with_subset(self):
         self.assertEqual(subset_data_3.get_data(), "lgb_train_data.bin")
         self.assertEqual(subset_data_4.get_data(), "lgb_train_data.bin")
 
-    def test_monotone_constraint(self):
+    def test_monotone_constraints_categorical_feature(self):
         def is_increasing(y):
             return (np.diff(y) >= 0.0).all()
 
         def is_decreasing(y):
             return (np.diff(y) <= 0.0).all()
 
-        def is_correctly_constrained(learner):
-            n = 200
+        def is_correctly_constrained(learner, number_categories):
+            n = 1000
+            iterations = 10
             variable_x = np.linspace(0, 1, n).reshape((n, 1))
             fixed_xs_values = np.linspace(0, 1, n)
-            for i in range(n):
+            for i in range(iterations):
                 fixed_x = fixed_xs_values[i] * np.ones((n, 1))
-                monotonically_increasing_x = np.column_stack((variable_x, fixed_x))
+                monotonically_increasing_x = np.column_stack((variable_x, fixed_x,
+                                                              (fixed_x * number_categories).astype(int)))
                 monotonically_increasing_y = learner.predict(monotonically_increasing_x)
-                monotonically_decreasing_x = np.column_stack((fixed_x, variable_x))
+                monotonically_decreasing_x = np.column_stack((fixed_x, variable_x,
+                                                              (fixed_x * number_categories).astype(int)))
                 monotonically_decreasing_y = learner.predict(monotonically_decreasing_x)
-                if not (is_increasing(monotonically_increasing_y) and is_decreasing(monotonically_decreasing_y)):
+                if not (is_increasing(monotonically_increasing_y) and
+                        is_decreasing(monotonically_decreasing_y)):
                     return False
             return True
 
+        number_of_trials = 10
+        for _ in range(number_of_trials):
+            for monotone_precise_mode in [False, True]:
+                number_categories = 2 ** (np.random.randint(1, 12))
+                number_of_dpoints = 3000
+                x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints)
+                x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
+                x3_categorical = (np.random.random(size=number_of_dpoints) * number_categories).astype(int)
+                x = np.column_stack(
+                    (x1_positively_correlated_with_y, x2_negatively_correlated_with_y, x3_categorical))
+                zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints)
+                scales = 10. * (np.random.random(6) + 0.5)
+                y = (scales[0] * x1_positively_correlated_with_y
+                     + np.sin(scales[1] * np.pi * x1_positively_correlated_with_y)
+                     - scales[2] * x2_negatively_correlated_with_y
+                     - np.cos(scales[3] * np.pi * x2_negatively_correlated_with_y)
+                     - scales[4] * x3_categorical
+                     - np.cos(scales[5] * np.pi * x3_categorical)
+                     + zs)
+                trainset = lgb.Dataset(x, label=y)
+                params = {
+                    'min_data': 20,
+                    'num_leaves': 10,
+                    "num_threads": 1,
+                    'monotone_constraints': '1,-1,0',
+                    "categorical_feature": [2],
+                    "monotone_precise_mode": monotone_precise_mode,
+                    "use_missing": False
+                }
+                constrained_model = lgb.train(params, trainset)
+                self.assertTrue(is_correctly_constrained(constrained_model, number_categories))
+
+    # test if categorical features and monotone features can both be in a dataset without causing issues
+    def generate_trainset_for_monotone_constraints_tests(self):
         number_of_dpoints = 3000
         x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints)
         x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
-        x = np.column_stack((x1_positively_correlated_with_y, x2_negatively_correlated_with_y))
+        x3_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
+        x = np.column_stack(
+            (x1_positively_correlated_with_y, x2_negatively_correlated_with_y, x3_negatively_correlated_with_y))
         zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints)
-        y = (5 * x1_positively_correlated_with_y
-             + np.sin(10 * np.pi * x1_positively_correlated_with_y)
-             - 5 * x2_negatively_correlated_with_y
-             - np.cos(10 * np.pi * x2_negatively_correlated_with_y)
+        scales = 10. * (np.random.random(6) + 0.5)
+        y = (scales[0] * x1_positively_correlated_with_y
+             + np.sin(scales[1] * np.pi * x1_positively_correlated_with_y)
+             - scales[2] * x2_negatively_correlated_with_y
+             - np.cos(scales[3] * np.pi * x2_negatively_correlated_with_y)
+             - scales[4] * x3_negatively_correlated_with_y
+             - np.cos(scales[5] * np.pi * x3_negatively_correlated_with_y)
              + zs)
         trainset = lgb.Dataset(x, label=y)
-        params = {
-            'min_data': 20,
-            'num_leaves': 20,
-            'monotone_constraints': '1,-1'
-        }
-        constrained_model = lgb.train(params, trainset)
-        self.assertTrue(is_correctly_constrained(constrained_model))
+        return trainset
+
+    def test_monotone_constraints(self):
+        def is_increasing(y):
+            return (np.diff(y) >= 0.0).all()
+
+        def is_decreasing(y):
+            return (np.diff(y) <= 0.0).all()
+
+        def is_non_monotone(y):
+            return (np.diff(y) < 0.0).any() and (np.diff(y) > 0.0).any()
+
+        def is_correctly_constrained(learner):
+            iterations = 10
+            n = 1000
+            variable_x = np.linspace(0, 1, n).reshape((n, 1))
+            fixed_xs_values = np.linspace(0, 1, n)
+            for i in range(iterations):
+                fixed_x = fixed_xs_values[i] * np.ones((n, 1))
+                monotonically_increasing_x = np.column_stack((variable_x, fixed_x, fixed_x))
+                monotonically_increasing_y = learner.predict(monotonically_increasing_x)
+                monotonically_decreasing_x = np.column_stack((fixed_x, variable_x, fixed_x))
+                monotonically_decreasing_y = learner.predict(monotonically_decreasing_x)
+                non_monotone_x = np.column_stack((fixed_x, fixed_x, variable_x))
+                non_monotone_y = learner.predict(non_monotone_x)
+                if not (is_increasing(monotonically_increasing_y) and
+                        is_decreasing(monotonically_decreasing_y) and
+                        is_non_monotone(non_monotone_y)):
+                    return False
+            return True
+
+        number_of_trials = 10
+        for _ in range(number_of_trials):
+            for monotone_precise_mode in [False, True]:
+                trainset = self.generate_trainset_for_monotone_constraints_tests()
+                params = {
+                    'min_data': 20,
+                    'num_leaves': 20,
+                    'monotone_constraints': '1,-1,0',
+                    "monotone_precise_mode": monotone_precise_mode,
+                    "use_missing": False
+                }
+                constrained_model = lgb.train(params, trainset)
+                self.assertTrue(is_correctly_constrained(constrained_model))
+
+    # test if the monotone penalty is working
+    def test_monotone_penalty(self):
+        def are_first_splits_non_monotone(tree, n, monotone_constraints):
+            if n <= 0:
+                return True
+            if "leaf_value" in tree:
+                return True
+            if monotone_constraints[tree["split_feature"]] != 0:
+                return False
+            return (are_first_splits_non_monotone(tree["left_child"], n - 1, monotone_constraints) and
+                    are_first_splits_non_monotone(tree["right_child"], n - 1, monotone_constraints))
+
+        def are_there_monotone_splits(tree, monotone_constraints):
+            if "leaf_value" in tree:
+                return False
+            if monotone_constraints[tree["split_feature"]] != 0:
+                return True
+            return (are_there_monotone_splits(tree["left_child"], monotone_constraints) or
+                    are_there_monotone_splits(tree["right_child"], monotone_constraints))
+
+        number_of_trials = 10
+        for _ in range(number_of_trials):
+            for monotone_precise_mode in [False, True]:
+                penalization_parameter = np.random.random() * 3
+                trainset = self.generate_trainset_for_monotone_constraints_tests()
+                monotone_constraints = [1, -1, 0]
+                params = {
+                    'min_data': 20,
+                    'num_leaves': 100,
+                    'monotone_constraints': monotone_constraints,
+                    'monotone_penalty': penalization_parameter,
+                    "monotone_precise_mode": monotone_precise_mode,
+                    "use_missing": False
+                }
+                constrained_model = lgb.train(params, trainset, 10)
+                dumped_model = constrained_model.dump_model()["tree_info"]
+                for tree in dumped_model:
+                    self.assertTrue(are_first_splits_non_monotone(tree["tree_structure"], int(penalization_parameter),
+                                                                  monotone_constraints))
+                    self.assertTrue(are_there_monotone_splits(tree["tree_structure"], monotone_constraints))
+
+    # test if a penalty as high as the depth indeed prohibits all monotone splits
+    def test_monotone_penalty_max(self):
+        number_of_trials = 10
+        for _ in range(number_of_trials):
+            for monotone_precise_mode in [False, True]:
+                max_depth = 5
+                penalization_parameter = max_depth - 1e-10
+                trainset_constrained_model = self.generate_trainset_for_monotone_constraints_tests()
+                x = trainset_constrained_model.data
+                y = trainset_constrained_model.label
+                x3_negatively_correlated_with_y = x[:, 2]
+                monotone_constraints = [1, -1, 0]
+                params_constrained_model = {
+                    'min_data': 20,
+                    'num_leaves': 20,
+                    'monotone_constraints': monotone_constraints,
+                    'monotone_penalty': penalization_parameter,
+                    "max_depth": max_depth,
+                    "monotone_precise_mode": monotone_precise_mode,
+                    "use_missing": False
+                }
+                constrained_model = lgb.train(params_constrained_model, trainset_constrained_model, 10)
+
+                trainset_unconstrained_model = lgb.Dataset(x3_negatively_correlated_with_y.reshape(-1, 1), label=y)
+                params_unconstrained_model = {
+                    'min_data': 20,
+                    'num_leaves': 20,
+                    "max_depth": max_depth
+                }
+                unconstrained_model = lgb.train(params_unconstrained_model, trainset_unconstrained_model, 10)
+
+                self.assertTrue((constrained_model.predict(x) ==
+                                 unconstrained_model.predict(x3_negatively_correlated_with_y.reshape(-1, 1))).all())
 
     def test_max_bin_by_feature(self):
         col1 = np.arange(0, 100)[:, np.newaxis]