Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LogMergePolicy knob del_docs_percentage_before_merge #1238

Merged
merged 10 commits into from
Dec 20, 2021
2 changes: 1 addition & 1 deletion src/indexer/index_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -974,7 +974,7 @@ mod tests {
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
level_log_size: 0.75 }"
level_log_size: 0.75, del_docs_percentage_before_merge: 100 }"
);
shikhar marked this conversation as resolved.
Show resolved Hide resolved
let merge_policy = Box::new(NoMergePolicy::default());
index_writer.set_merge_policy(merge_policy);
Expand Down
38 changes: 32 additions & 6 deletions src/indexer/log_merge_policy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@ use super::merge_policy::{MergeCandidate, MergePolicy};
use crate::core::SegmentMeta;
use itertools::Itertools;
use std::cmp;
use std::f64;

// Default values for the `LogMergePolicy` settings (presumably all wired up in
// its `Default` impl; only part of that impl is visible here — verify).
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;
// Default tolerated percentage of deleted documents per segment. The threshold
// check uses a strict `>`, so with 100 a delete-triggered merge should never
// fire (assumes a segment never reports more deleted docs than `max_doc` —
// TODO confirm).
const DEFAULT_DEL_DOCS_PERCENTAGE_BEFORE_MERGE: u8 = 100;
fulmicoton marked this conversation as resolved.
Show resolved Hide resolved

/// `LogMergePolicy` tries to merge segments that have a similar number of
/// documents.
Expand All @@ -17,6 +17,7 @@ pub struct LogMergePolicy {
max_docs_before_merge: usize,
min_layer_size: u32,
level_log_size: f64,
del_docs_percentage_before_merge: u8,
}

impl LogMergePolicy {
Expand Down Expand Up @@ -52,19 +53,40 @@ impl LogMergePolicy {
pub fn set_level_log_size(&mut self, level_log_size: f64) {
// The value is stored as given; no range validation is performed here.
self.level_log_size = level_log_size;
}

/// Set the percentage of deleted documents in a segment to tolerate.
/// If it is exceeded by any segment at a log level, a merge
/// will be triggered for that level.
///
/// The comparison is strict: a segment whose (integer, rounded-down)
/// deleted-documents percentage is exactly equal to the threshold does not
/// trigger a merge on its own.
///
/// If there is a single segment at a level, we effectively end up expunging
/// deleted documents from it.
///
/// # Panics
///
/// Panics if `del_docs_percentage_before_merge` is greater than 100.
pub fn set_del_docs_percentage_before_merge(&mut self, del_docs_percentage_before_merge: u8) {
// A percentage above 100 is meaningless, so reject it up front.
assert!(del_docs_percentage_before_merge <= 100);
self.del_docs_percentage_before_merge = del_docs_percentage_before_merge;
}

/// Returns `true` as soon as one segment of `level` has a deleted-documents
/// percentage strictly greater than `del_docs_percentage_before_merge`;
/// returns `false` if no segment does (including when `level` is empty).
fn has_segment_above_deletes_threshold(&self, level: &[&SegmentMeta]) -> bool {
    let threshold = self.del_docs_percentage_before_merge;
    for segment in level {
        // Stop at the first offender; the remaining segments need no check.
        if deletes_percentage(segment) > threshold {
            return true;
        }
    }
    false
}
}

/// Returns the share of deleted documents in `segment` as an integer
/// percentage: `num_deleted_docs * 100 / max_doc`, rounded down.
///
/// A segment whose `max_doc()` is 0 is reported as 0% rather than panicking
/// on an integer division by zero.
fn deletes_percentage(segment: &SegmentMeta) -> u8 {
    let max_doc = segment.max_doc() as u64;
    if max_doc == 0 {
        // An empty segment has nothing deleted; avoid dividing by zero.
        return 0;
    }
    // Widen to u64 before multiplying so `num_deleted_docs * 100` cannot
    // overflow. The final narrowing is lossless as long as the segment never
    // reports more deleted docs than `max_doc` (assumed — verify upstream).
    (segment.num_deleted_docs() as u64 * 100 / max_doc) as u8
}

impl MergePolicy for LogMergePolicy {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
let mut size_sorted_segments = segments
let size_sorted_segments = segments
.iter()
.filter(|segment_meta| segment_meta.num_docs() <= (self.max_docs_before_merge as u32))
.filter(|seg| seg.num_docs() <= (self.max_docs_before_merge as u32))
.sorted_by_key(|seg| std::cmp::Reverse(seg.max_doc()))
.collect::<Vec<&SegmentMeta>>();

if size_sorted_segments.len() <= 1 {
if size_sorted_segments.is_empty() {
return vec![];
}
size_sorted_segments.sort_by_key(|seg| std::cmp::Reverse(seg.num_docs()));

let mut current_max_log_size = f64::MAX;
let mut levels = vec![];
Expand All @@ -82,7 +104,10 @@ impl MergePolicy for LogMergePolicy {

levels
.iter()
.filter(|level| level.len() >= self.min_num_segments)
.filter(|level| {
level.len() >= self.min_num_segments
|| self.has_segment_above_deletes_threshold(level)
})
.map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
.collect()
}
Expand All @@ -95,6 +120,7 @@ impl Default for LogMergePolicy {
max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
min_layer_size: DEFAULT_MIN_LAYER_SIZE,
level_log_size: DEFAULT_LEVEL_LOG_SIZE,
del_docs_percentage_before_merge: DEFAULT_DEL_DOCS_PERCENTAGE_BEFORE_MERGE,
}
}
}
Expand Down