ssec-jhu · adam2392 · Jul 6, 2024 · SamuelCarliles3 · Jul 6, 2024 · adam2392
diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd
@@ -10,7 +10,7 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1
 # functions. The alternative would have been to use inheritance-based polymorphism
 # but it would have resulted in a ~10% overall tree fitting performance
 # degradation caused by the overhead frequent virtual method lookups.
-#ctypedef fused Partitioner:
+# ctypedef fused Partitioner:
 #    DensePartitioner
 #    SparsePartitioner
 
@@ -67,8 +67,15 @@ cdef class Partitioner:
             float32_t* min_feature_value_out,
             float32_t* max_feature_value_out,
         ) noexcept nogil
-        inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil
-        inline intp_t partition_samples(self, float64_t current_threshold) noexcept nogil        
+        inline void next_p(
+            self,
+            intp_t* p_prev,
+            intp_t* p
+        ) noexcept nogil
+        inline intp_t partition_samples(
+            self,
+            float64_t current_threshold
+        ) noexcept nogil
         inline void partition_samples_final(
             self,
             intp_t best_pos,

diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
@@ -6,14 +6,14 @@ from scipy.sparse import issparse
 
 import numpy as np
 
-from ._sort cimport sort, sparse_swap, swap, FEATURE_THRESHOLD
+from ._splitter cimport sort, sparse_swap, FEATURE_THRESHOLD
 
 
 cdef class Partitioner:
     cdef:
         inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil:
             self._init_node_split(self, start, end)
-        
+
         inline void sort_samples_and_feature_values(
             self,
             intp_t current_feature
@@ -33,7 +33,7 @@ cdef class Partitioner:
 
         inline intp_t partition_samples(self, float64_t current_threshold) noexcept nogil:
             return self._partition_samples(self, current_threshold)
-        
+
         inline void partition_samples_final(
             self,
             intp_t best_pos,
@@ -536,22 +536,22 @@ cdef inline void sparse_extract_nnz(SparsePartitioner self, intp_t feature) noex
     if ((1 - self.is_samples_sorted) * n_samples * log(n_samples) +
             n_samples * log(n_indices) < EXTRACT_NNZ_SWITCH * n_indices):
         extract_nnz_binary_search(X_indices, X_data,
-                                    indptr_start, indptr_end,
-                                    samples, self.start, self.end,
-                                    index_to_samples,
-                                    feature_values,
-                                    &self.end_negative, &self.start_positive,
-                                    sorted_samples, &self.is_samples_sorted)
+                                  indptr_start, indptr_end,
+                                  samples, self.start, self.end,
+                                  index_to_samples,
+                                  feature_values,
+                                  &self.end_negative, &self.start_positive,
+                                  sorted_samples, &self.is_samples_sorted)
 
     # Using an index to samples  technique to extract non zero values
     # index_to_samples is a mapping from X_indices to samples
     else:
         extract_nnz_index_to_samples(X_indices, X_data,
-                                        indptr_start, indptr_end,
-                                        samples, self.start, self.end,
-                                        index_to_samples,
-                                        feature_values,
-                                        &self.end_negative, &self.start_positive)
+                                     indptr_start, indptr_end,
+                                     samples, self.start, self.end,
+                                     index_to_samples,
+                                     feature_values,
+                                     &self.end_negative, &self.start_positive)
 
 
 cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil:

diff --git a/sklearn/tree/_sort.pxd b/sklearn/tree/_sort.pxd
diff --git a/sklearn/tree/_sort.pyx b/sklearn/tree/_sort.pyx
diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
@@ -10,6 +10,10 @@ from ._tree cimport ParentInfo
 from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, uint32_t
 
 
+# Mitigate precision differences between 32-bit and 64-bit
+cdef float32_t FEATURE_THRESHOLD = 1e-7
+
+
 cdef struct SplitRecord:
     # Data to track sample split
     intp_t feature         # Which feature to split on.
@@ -132,3 +136,16 @@ cdef void shift_missing_values_to_left_if_required(
     intp_t[::1] samples,
     intp_t end,
 ) noexcept nogil
+
+
+# Sort n-element arrays pointed to by feature_values and samples, simultaneously,
+# by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997).
+cdef void sort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil
+
+cdef void swap(float32_t* feature_values, intp_t* samples, intp_t i, intp_t j) noexcept nogil
+cdef void sparse_swap(
+    intp_t[::1] index_to_samples,
+    intp_t[::1] samples,
+    intp_t pos_1,
+    intp_t pos_2
+) noexcept nogil