databendlabs · BohuTANG · Nov 26, 2024 · Nov 25, 2024 · Nov 25, 2024 · Nov 25, 2024
diff --git a/src/query/sql/src/planner/optimizer/property/histogram.rs b/src/query/sql/src/planner/optimizer/property/histogram.rs
@@ -77,13 +77,13 @@ pub fn histogram_from_ndv(
     let mut buckets: Vec<HistogramBucket> = Vec::with_capacity(num_buckets);
     let sample_set = UniformSampleSet { min, max };
 
-    for idx in 0..num_buckets + 1 {
-        let upper_bound = sample_set.get_upper_bound(num_buckets, idx)?;
+    for idx in 0..num_buckets {
         let lower_bound = if idx == 0 {
             sample_set.min.clone()
         } else {
             buckets[idx - 1].upper_bound().clone()
         };
+        let upper_bound = sample_set.get_upper_bound(num_buckets, idx + 1)?;
         let bucket = HistogramBucket::new(
             lower_bound,
             upper_bound,

diff --git a/src/query/sql/src/planner/optimizer/property/selectivity.rs b/src/query/sql/src/planner/optimizer/property/selectivity.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use std::cmp::max;
 use std::cmp::Ordering;
 use std::collections::HashSet;
 
@@ -27,8 +28,10 @@ use databend_common_expression::FunctionContext;
 use databend_common_expression::Scalar;
 use databend_common_functions::BUILTIN_FUNCTIONS;
 use databend_common_storage::Datum;
+use databend_common_storage::DEFAULT_HISTOGRAM_BUCKETS;
 use databend_common_storage::F64;
 
+use crate::optimizer::histogram_from_ndv;
 use crate::optimizer::ColumnStat;
 use crate::optimizer::Statistics;
 use crate::plans::BoundColumnRef;
@@ -323,8 +326,10 @@ impl<'a> SelectivityEstimator<'a> {
                 column_stat.ndv = new_ndv;
                 if let Some(histogram) = &mut column_stat.histogram {
                     if histogram.accuracy {
+                        // If selectivity < 0.2, most buckets are invalid and
+                        // the accuracy histogram can be discarded.
                         // Todo: find a better way to update histogram.
-                        if selectivity < 0.8 {
+                        if selectivity < 0.2 {
                             column_stat.histogram = None;
                         }
                         continue;
@@ -349,13 +354,14 @@ impl<'a> SelectivityEstimator<'a> {
         column_stat: &mut ColumnStat,
         updated_column_indexes: &mut HashSet<IndexType>,
     ) -> Result<f64> {
-        let col_hist = column_stat.histogram.as_ref();
+        let histogram = column_stat.histogram.as_ref();
 
-        if col_hist.is_none() && const_datum.is_numeric() {
+        if histogram.is_none() && const_datum.is_numeric() {
             // If there is no histogram and the column isn't numeric, return default selectivity.
-            if !column_stat.min.is_numeric() {
+            if !column_stat.min.is_numeric() || !column_stat.max.is_numeric() {
                 return Ok(DEFAULT_SELECTIVITY);
             }
+
             let min = column_stat.min.to_double()?;
             let max = column_stat.max.to_double()?;
             let ndv = column_stat.ndv;
@@ -427,69 +433,93 @@ impl<'a> SelectivityEstimator<'a> {
             return Ok(percent);
         }
 
-        if col_hist.is_none() {
+        let Some(histogram) = histogram else {
             return Ok(DEFAULT_SELECTIVITY);
-        }
-        let col_hist = col_hist.unwrap();
-        let (mut num_greater, new_min, new_max) = match comparison_op {
-            ComparisonOp::GT | ComparisonOp::GTE => {
-                let new_min = const_datum.clone();
-                let new_max = column_stat.max.clone();
-                (0.0, new_min, new_max)
-            }
-            ComparisonOp::LT | ComparisonOp::LTE => {
-                let new_max = const_datum.clone();
-                let new_min = column_stat.min.clone();
-                (0.0, new_min, new_max)
-            }
-            _ => unreachable!(),
         };
 
-        for bucket in col_hist.buckets_iter() {
-            if let Ok(ord) = bucket.upper_bound().compare(const_datum) {
-                match comparison_op {
-                    ComparisonOp::GT => {
-                        if ord == Ordering::Less || ord == Ordering::Equal {
-                            num_greater += bucket.num_values();
-                        } else {
-                            break;
-                        }
-                    }
-                    ComparisonOp::GTE => {
-                        if ord == Ordering::Less {
-                            num_greater += bucket.num_values();
-                        } else {
-                            break;
-                        }
-                    }
-                    ComparisonOp::LT => {
-                        if ord == Ordering::Less {
-                            num_greater += bucket.num_values();
+        let mut num_selected = 0.0;
+        for bucket in histogram.buckets_iter() {
+            let lower_bound = bucket.lower_bound();
+            let upper_bound = bucket.upper_bound();
+
+            let const_gte_upper_bound = matches!(
+                const_datum.compare(upper_bound)?,
+                Ordering::Greater | Ordering::Equal
+            );
+            let (no_overlap, complete_overlap) = match comparison_op {
+                ComparisonOp::LT => (
+                    matches!(
+                        const_datum.compare(lower_bound)?,
+                        Ordering::Less | Ordering::Equal
+                    ),
+                    const_gte_upper_bound,
+                ),
+                ComparisonOp::LTE => (
+                    matches!(const_datum.compare(lower_bound)?, Ordering::Less),
+                    const_gte_upper_bound,
+                ),
+                ComparisonOp::GT => (
+                    const_gte_upper_bound,
+                    matches!(const_datum.compare(lower_bound)?, Ordering::Less),
+                ),
+                ComparisonOp::GTE => (
+                    const_gte_upper_bound,
+                    matches!(
+                        const_datum.compare(lower_bound)?,
+                        Ordering::Less | Ordering::Equal
+                    ),
+                ),
+                _ => unreachable!(),
+            };
+
+            if complete_overlap {
+                num_selected += bucket.num_values();
+            } else if !no_overlap && const_datum.is_numeric() {
+                let ndv = bucket.num_distinct();
+                let lower_bound = lower_bound.to_double()?;
+                let upper_bound = upper_bound.to_double()?;
+                let const_value = const_datum.to_double()?;
+
+                let bucket_range = upper_bound - lower_bound;
+                let bucket_selectivity = match comparison_op {
+                    ComparisonOp::LT => (const_value - lower_bound) / bucket_range,
+                    ComparisonOp::LTE => {
+                        if const_value == lower_bound {
+                            1.0 / ndv
                         } else {
-                            break;
+                            (const_value - lower_bound + 1.0) / bucket_range
                         }
                     }
-                    ComparisonOp::LTE => {
-                        if ord == Ordering::Less || ord == Ordering::Equal {
-                            num_greater += bucket.num_values();
+                    ComparisonOp::GT => {
+                        if const_value == lower_bound {
+                            1.0 - 1.0 / ndv
                         } else {
-                            break;
+                            (upper_bound - const_value - 1.0).max(0.0) / bucket_range
                         }
                     }
+                    ComparisonOp::GTE => (upper_bound - const_value) / bucket_range,
                     _ => unreachable!(),
-                }
-            } else {
-                return Ok(DEFAULT_SELECTIVITY);
+                };
+                num_selected += bucket.num_values() * bucket_selectivity;
             }
         }
 
-        let selectivity = match comparison_op {
-            ComparisonOp::GT | ComparisonOp::GTE => 1.0 - num_greater / col_hist.num_values(),
-            ComparisonOp::LT | ComparisonOp::LTE => num_greater / col_hist.num_values(),
-            _ => unreachable!(),
-        };
+        let selectivity = num_selected / histogram.num_values();
 
         if update {
+            let (new_min, new_max) = match comparison_op {
+                ComparisonOp::GT | ComparisonOp::GTE => {
+                    let new_min = const_datum.clone();
+                    let new_max = column_stat.max.clone();
+                    (new_min, new_max)
+                }
+                ComparisonOp::LT | ComparisonOp::LTE => {
+                    let new_max = const_datum.clone();
+                    let new_min = column_stat.min.clone();
+                    (new_min, new_max)
+                }
+                _ => unreachable!(),
+            };
             update_statistic(column_stat, new_min, new_max, selectivity)?;
             updated_column_indexes.insert(column_ref.column.index);
         }
@@ -575,11 +605,31 @@ fn update_statistic(
         new_min = Datum::Float(F64::from(new_min.to_double()?));
         new_max = Datum::Float(F64::from(new_max.to_double()?));
     }
-    if selectivity < 0.8 {
-        // Todo: support unfixed buckets number for histogram and prune the histogram.
-        column_stat.histogram = None;
-    }
     column_stat.min = new_min.clone();
     column_stat.max = new_max.clone();
+
+    if let Some(histogram) = &column_stat.histogram {
+        // If selectivity < 0.2, most buckets are invalid and
+        // the accuracy histogram can be discarded.
+        // Todo: support unfixed buckets number for histogram and prune the histogram.
+        column_stat.histogram = if histogram.accuracy && selectivity >= 0.2 {
+            Some(histogram.clone())
+        } else {
+            let num_values = histogram.num_values();
+            let new_num_values = (num_values * selectivity).ceil() as u64;
+            let new_ndv = new_ndv as u64;
+            if new_ndv <= 2 {
+                column_stat.histogram = None;
+                return Ok(());
+            }
+            Some(histogram_from_ndv(
+                new_ndv,
+                max(new_num_values, new_ndv),
+                Some((new_min, new_max)),
+                DEFAULT_HISTOGRAM_BUCKETS,
+            )?)
+        }
+    }
+
     Ok(())
 }