From ef564c06f95474e1916869b18fc74b1c7d06bac4 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 20 Dec 2023 15:09:58 +0800 Subject: [PATCH] fix merge panic for JSON fields The root cause was that the positions buffer retained residual positions from the previous term when terms alternated between having and not having positions in a JSON field (text terms have positions, numeric terms do not). Fixes #2283 --- src/indexer/index_writer.rs | 3 +++ src/indexer/merger.rs | 4 ++++ src/indexer/segment_writer.rs | 25 +++++++++++++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 2323806d12..f2e5477501 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1651,6 +1651,7 @@ mod tests { force_end_merge: bool, ) -> crate::Result { let mut schema_builder = schema::Schema::builder(); + let json_field = schema_builder.add_json_field("json", FAST | TEXT | STORED); let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED); let ips_field = schema_builder .add_ip_addr_field("ips", IpAddrOptions::default().set_fast().set_indexed()); @@ -1729,7 +1730,9 @@ mod tests { id_field=>id, ))?; } else { + let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string()}); index_writer.add_document(doc!(id_field=>id, + json_field=>json, bytes_field => id.to_le_bytes().as_slice(), id_opt_field => id, ip_field => ip, diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 87bc4c8c8c..8612f66c55 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -605,6 +605,10 @@ impl IndexMerger { segment_postings.positions(&mut positions_buffer); segment_postings.term_freq() } else { + // The positions_buffer may contain positions from the previous term. + // The existence of positions depends on the value type in JSON fields. 
+ // https://github.com/quickwit-oss/tantivy/issues/2283 + positions_buffer.clear(); 0u32 }; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 1888f3b47e..c0bd8d4403 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -879,6 +879,31 @@ mod tests { assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0); } + #[test] + fn test_json_term_with_numeric_merge_panic_regression_bug_2283() { + // https://github.com/quickwit-oss/tantivy/issues/2283 + let mut schema_builder = Schema::builder(); + let json = schema_builder.add_json_field("json", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut writer = index.writer_for_tests().unwrap(); + let doc = json!({"field": "a"}); + writer.add_document(doc!(json=>doc)).unwrap(); + writer.commit().unwrap(); + let doc = json!({"field": "a", "id": 1}); + writer.add_document(doc!(json=>doc.clone())).unwrap(); + writer.commit().unwrap(); + + // Force Merge + writer.wait_merging_threads().unwrap(); + let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); + let segment_ids = index + .searchable_segment_ids() + .expect("Searchable segments failed."); + index_writer.merge(&segment_ids).wait().unwrap(); + assert!(index_writer.wait_merging_threads().is_ok()); + } + #[test] fn test_bug_regression_1629_position_when_array_with_a_field_value_that_does_not_contain_any_token( ) {