From 2a39289a1b0cb4e1a064552ce8c224f3ebee28a8 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 16 Nov 2022 07:18:34 +0900 Subject: [PATCH] Handle escaped dot in json path in the QueryParser. (#1682) --- src/indexer/json_term_writer.rs | 83 +++++++++++++++++++++++++- src/indexer/mod.rs | 27 ++++++++- src/query/query_parser/query_parser.rs | 22 +++++++ 3 files changed, 126 insertions(+), 6 deletions(-) diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs index 770ac15cdd..71543acd37 100644 --- a/src/indexer/json_term_writer.rs +++ b/src/indexer/json_term_writer.rs @@ -261,6 +261,39 @@ pub struct JsonTermWriter<'a> { path_stack: Vec, } +/// Splits a json path supplied to the query parser in such a way that +/// `.` can be escaped. +/// +/// In other words, +/// - `k8s.node` ends up as `["k8s", "node"]`. +/// - `k8s\.node` ends up as `["k8s.node"]`. +fn split_json_path(json_path: &str) -> Vec { + let mut escaped_state: bool = false; + let mut json_path_segments = Vec::new(); + let mut buffer = String::new(); + for ch in json_path.chars() { + if escaped_state { + buffer.push(ch); + escaped_state = false; + continue; + } + match ch { + '\\' => { + escaped_state = true; + } + '.' => { + let new_segment = std::mem::take(&mut buffer); + json_path_segments.push(new_segment); + } + _ => { + buffer.push(ch); + } + } + } + json_path_segments.push(buffer); + json_path_segments +} + impl<'a> JsonTermWriter<'a> { pub fn from_field_and_json_path( field: Field, @@ -269,8 +302,8 @@ impl<'a> JsonTermWriter<'a> { ) -> Self { term_buffer.set_field_and_type(field, Type::Json); let mut json_term_writer = Self::wrap(term_buffer); - for segment in json_path.split('.') { - json_term_writer.push_path_segment(segment); + for segment in split_json_path(json_path) { + json_term_writer.push_path_segment(&segment); } json_term_writer } @@ -350,7 +383,7 @@ impl<'a> JsonTermWriter<'a> { #[cfg(test)] mod tests { - use super::JsonTermWriter; + use super::{split_json_path, JsonTermWriter}; use crate::schema::{Field, Type}; use crate::Term; @@ -495,4 +528,48 @@ mod tests { json_writer.set_str("pink"); assert_eq!(json_writer.path(), b"color\x01hue"); } + + #[test] + fn test_split_json_path_simple() { + let json_path = split_json_path("titi.toto"); + assert_eq!(&json_path, &["titi", "toto"]); + } + + #[test] + fn test_split_json_path_single_segment() { + let json_path = split_json_path("toto"); + assert_eq!(&json_path, &["toto"]); + } + + #[test] + fn test_split_json_path_trailing_dot() { + let json_path = split_json_path("toto."); + assert_eq!(&json_path, &["toto", ""]); + } + + #[test] + fn test_split_json_path_heading_dot() { + let json_path = split_json_path(".toto"); + assert_eq!(&json_path, &["", "toto"]); + } + + #[test] + fn test_split_json_path_escaped_dot() { + let json_path = split_json_path(r#"toto\.titi"#); + assert_eq!(&json_path, &["toto.titi"]); + let json_path_2 = split_json_path(r#"k8s\.container\.name"#); + assert_eq!(&json_path_2, &["k8s.container.name"]); + } + + #[test] + fn test_split_json_path_escaped_backslash() { + let json_path = split_json_path(r#"toto\\titi"#); + assert_eq!(&json_path, &[r#"toto\titi"#]); + } + + #[test] + fn test_split_json_path_escaped_normal_letter() { + let json_path = split_json_path(r#"toto\titi"#); + assert_eq!(&json_path, &[r#"tototiti"#]); + } } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index c557350cf1..c55c241f1b 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -58,13 +58,15 @@ type AddBatchReceiver = channel::Receiver; #[cfg(feature = "mmap")] #[cfg(test)] mod tests_mmap { - use crate::schema::{self, Schema}; + use crate::collector::Count; + use crate::query::QueryParser; + use crate::schema::{Schema, STORED, TEXT}; use crate::{Index, Term}; #[test] fn test_advance_delete_bug() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let text_field = schema_builder.add_text_field("text", schema::TEXT); + let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_from_tempdir(schema_builder.build())?; let mut index_writer = index.writer_for_tests()?; // there must be one deleted document in the segment @@ -75,7 +77,26 @@ mod tests_mmap { index_writer.add_document(doc!(text_field=>"c"))?; } index_writer.commit()?; - index_writer.commit()?; Ok(()) } + + #[test] + fn test_json_field_espace() { + let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("json", TEXT | STORED); + let index = Index::create_in_ram(schema_builder.build()); + let mut index_writer = index.writer_for_tests().unwrap(); + let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"}); + index_writer.add_document(doc!(json_field=>json)).unwrap(); + index_writer.commit().unwrap(); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + assert_eq!(searcher.num_docs(), 1); + let parse_query = QueryParser::for_index(&index, Vec::new()); + let query = parse_query + .parse_query(r#"json.k8s\.container\.name:prometheus"#) + .unwrap(); + let num_docs = searcher.search(&query, &Count).unwrap(); + assert_eq!(num_docs, 1); + } } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 07feb0e579..19f3cf21c2 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -1062,6 +1062,28 @@ mod test { ); } + fn extract_query_term_json_path(query: &str) -> String { + let LogicalAst::Leaf(literal) = parse_query_to_logical_ast(query, false).unwrap() else { + panic!(); + }; + let LogicalLiteral::Term(term) = *literal else { + panic!(); + }; + std::str::from_utf8(term.value_bytes()).unwrap().to_string() + } + + #[test] + fn test_json_field_query_with_espaced_dot() { + assert_eq!( + extract_query_term_json_path(r#"json.k8s.node.name:hello"#), + "k8s\u{1}node\u{1}name\0shello" + ); + assert_eq!( + extract_query_term_json_path(r#"json.k8s\.node\.name:hello"#), + "k8s.node.name\0shello" + ); + } + #[test] fn test_json_field_possibly_a_number() { test_parse_query_to_logical_ast_helper(