From 014328e378ff313a0b7f00f124e02d9230aa2039 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 9 Jan 2024 22:52:20 +0900 Subject: [PATCH] Fix bug that can cause `get_docids_for_value_range` to panic. (#2295) * Fix bug that can cause `get_docids_for_value_range` to panic. When `selected_docid_range.end == num_rows`, we would get a panic as we try to access a non-existing blockmeta. This PR accepts calls to rank with any value. For any value above num_rows we simply return non_null_rows. Fixes #2293 * add tests, merge variables --------- Co-authored-by: Pascal Seitz --- columnar/src/column_index/mod.rs | 12 +++---- .../src/column_index/optional_index/mod.rs | 17 ++++++---- .../src/column_index/optional_index/set.rs | 3 +- .../src/column_index/optional_index/tests.rs | 32 ++++++++++++++++--- columnar/src/column_values/mod.rs | 2 +- 5 files changed, 48 insertions(+), 18 deletions(-) diff --git a/columnar/src/column_index/mod.rs b/columnar/src/column_index/mod.rs index d6711566d4..6ae7046c82 100644 --- a/columnar/src/column_index/mod.rs +++ b/columnar/src/column_index/mod.rs @@ -126,18 +126,18 @@ impl ColumnIndex { } } - pub fn docid_range_to_rowids(&self, doc_id: Range) -> Range { + pub fn docid_range_to_rowids(&self, doc_id_range: Range) -> Range { match self { ColumnIndex::Empty { .. } => 0..0, - ColumnIndex::Full => doc_id, + ColumnIndex::Full => doc_id_range, ColumnIndex::Optional(optional_index) => { - let row_start = optional_index.rank(doc_id.start); - let row_end = optional_index.rank(doc_id.end); + let row_start = optional_index.rank(doc_id_range.start); + let row_end = optional_index.rank(doc_id_range.end); row_start..row_end } ColumnIndex::Multivalued(multivalued_index) => { - let end_docid = doc_id.end.min(multivalued_index.num_docs() - 1) + 1; - let start_docid = doc_id.start.min(end_docid); + let end_docid = doc_id_range.end.min(multivalued_index.num_docs() - 1) + 1; + let start_docid = doc_id_range.start.min(end_docid); let row_start = multivalued_index.start_index_column.get_val(start_docid); let row_end = multivalued_index.start_index_column.get_val(end_docid); diff --git a/columnar/src/column_index/optional_index/mod.rs b/columnar/src/column_index/optional_index/mod.rs index e885ee5bcb..d484152843 100644 --- a/columnar/src/column_index/optional_index/mod.rs +++ b/columnar/src/column_index/optional_index/mod.rs @@ -21,8 +21,6 @@ const DENSE_BLOCK_THRESHOLD: u32 = const ELEMENTS_PER_BLOCK: u32 = u16::MAX as u32 + 1; -const BLOCK_SIZE: RowId = 1 << 16; - #[derive(Copy, Clone, Debug)] struct BlockMeta { non_null_rows_before_block: u32, @@ -109,8 +107,8 @@ struct RowAddr { #[inline(always)] fn row_addr_from_row_id(row_id: RowId) -> RowAddr { RowAddr { - block_id: (row_id / BLOCK_SIZE) as u16, - in_block_row_id: (row_id % BLOCK_SIZE) as u16, + block_id: (row_id / ELEMENTS_PER_BLOCK) as u16, + in_block_row_id: (row_id % ELEMENTS_PER_BLOCK) as u16, } } @@ -185,8 +183,13 @@ impl Set for OptionalIndex { } } + /// Any value doc_id is allowed. + /// In particular, doc_id = num_rows. #[inline] fn rank(&self, doc_id: DocId) -> RowId { + if doc_id >= self.num_docs() { + return self.num_non_nulls(); + } let RowAddr { block_id, in_block_row_id, @@ -200,13 +203,15 @@ impl Set for OptionalIndex { block_meta.non_null_rows_before_block + block_offset_row_id } + /// Any value doc_id is allowed. + /// In particular, doc_id = num_rows. #[inline] fn rank_if_exists(&self, doc_id: DocId) -> Option { let RowAddr { block_id, in_block_row_id, } = row_addr_from_row_id(doc_id); - let block_meta = self.block_metas[block_id as usize]; + let block_meta = *self.block_metas.get(block_id as usize)?; let block = self.block(block_meta); let block_offset_row_id = match block { Block::Dense(dense_block) => dense_block.rank_if_exists(in_block_row_id), @@ -491,7 +496,7 @@ fn deserialize_optional_index_block_metadatas( non_null_rows_before_block += num_non_null_rows; } block_metas.resize( - ((num_rows + BLOCK_SIZE - 1) / BLOCK_SIZE) as usize, + ((num_rows + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK) as usize, BlockMeta { non_null_rows_before_block, start_byte_offset, diff --git a/columnar/src/column_index/optional_index/set.rs b/columnar/src/column_index/optional_index/set.rs index 15b527dbd3..b2bf9cbe2c 100644 --- a/columnar/src/column_index/optional_index/set.rs +++ b/columnar/src/column_index/optional_index/set.rs @@ -39,7 +39,8 @@ pub trait Set { /// /// # Panics /// - /// May panic if rank is greater than the number of elements in the Set. + /// May panic if rank is greater or equal to the number of + /// elements in the Set. fn select(&self, rank: T) -> T; /// Creates a brand new select cursor. diff --git a/columnar/src/column_index/optional_index/tests.rs b/columnar/src/column_index/optional_index/tests.rs index 2acc5f6e6f..85dfcbd9a8 100644 --- a/columnar/src/column_index/optional_index/tests.rs +++ b/columnar/src/column_index/optional_index/tests.rs @@ -3,6 +3,30 @@ use proptest::strategy::Strategy; use proptest::{prop_oneof, proptest}; use super::*; +use crate::{ColumnarReader, ColumnarWriter, DynamicColumnHandle}; + +#[test] +fn test_optional_index_bug_2293() { + // tests for panic in docid_range_to_rowids for docid == num_docs + test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK - 1); + test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK); + test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK + 1); +} +fn test_optional_index_with_num_docs(num_docs: u32) { + let mut dataframe_writer = ColumnarWriter::default(); + dataframe_writer.record_numerical(100, "score", 80i64); + let mut buffer: Vec = Vec::new(); + dataframe_writer + .serialize(num_docs, None, &mut buffer) + .unwrap(); + let columnar = ColumnarReader::open(buffer).unwrap(); + assert_eq!(columnar.num_columns(), 1); + let cols: Vec = columnar.read_columns("score").unwrap(); + assert_eq!(cols.len(), 1); + + let col = cols[0].open().unwrap(); + col.column_index().docid_range_to_rowids(0..num_docs); +} #[test] fn test_dense_block_threshold() { @@ -35,7 +59,7 @@ proptest! { #[test] fn test_with_random_sets_simple() { - let vals = 10..BLOCK_SIZE * 2; + let vals = 10..ELEMENTS_PER_BLOCK * 2; let mut out: Vec = Vec::new(); serialize_optional_index(&vals, 100, &mut out).unwrap(); let null_index = open_optional_index(OwnedBytes::new(out)).unwrap(); @@ -171,7 +195,7 @@ fn test_optional_index_rank() { test_optional_index_rank_aux(&[0u32, 1u32]); let mut block = Vec::new(); block.push(3u32); - block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1)); + block.extend((0..ELEMENTS_PER_BLOCK).map(|i| i + ELEMENTS_PER_BLOCK + 1)); test_optional_index_rank_aux(&block); } @@ -185,8 +209,8 @@ fn test_optional_index_iter_empty_one() { fn test_optional_index_iter_dense_block() { let mut block = Vec::new(); block.push(3u32); - block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1)); - test_optional_index_iter_aux(&block, 3 * BLOCK_SIZE); + block.extend((0..ELEMENTS_PER_BLOCK).map(|i| i + ELEMENTS_PER_BLOCK + 1)); + test_optional_index_iter_aux(&block, 3 * ELEMENTS_PER_BLOCK); } #[test] diff --git a/columnar/src/column_values/mod.rs b/columnar/src/column_values/mod.rs index f2e1b036a2..9b94b7e94f 100644 --- a/columnar/src/column_values/mod.rs +++ b/columnar/src/column_values/mod.rs @@ -101,7 +101,7 @@ pub trait ColumnValues: Send + Sync { row_id_hits: &mut Vec, ) { let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals()); - for idx in row_id_range.start..row_id_range.end { + for idx in row_id_range { let val = self.get_val(idx); if value_range.contains(&val) { row_id_hits.push(idx);