diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 233fa27d139..3ce5526bd7a 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -5894,7 +5894,6 @@ version = "0.8.0" dependencies = [ "anyhow", "itertools 0.13.0", - "ouroboros", "serde", "serde_json", "tantivy", diff --git a/quickwit/quickwit-datetime/Cargo.toml b/quickwit/quickwit-datetime/Cargo.toml index c30e6b029e1..004e959a348 100644 --- a/quickwit/quickwit-datetime/Cargo.toml +++ b/quickwit/quickwit-datetime/Cargo.toml @@ -13,7 +13,6 @@ license.workspace = true [dependencies] anyhow = { workspace = true } itertools = { workspace = true } -ouroboros = "0.18.0" serde = { workspace = true } serde_json = { workspace = true } tantivy = { workspace = true } diff --git a/quickwit/quickwit-datetime/src/date_time_format.rs b/quickwit/quickwit-datetime/src/date_time_format.rs index 42b282ef6db..1758e289113 100644 --- a/quickwit/quickwit-datetime/src/date_time_format.rs +++ b/quickwit/quickwit-datetime/src/date_time_format.rs @@ -20,138 +20,14 @@ use std::fmt::Display; use std::str::FromStr; -use ouroboros::self_referencing; use serde::de::Error; use serde::{Deserialize, Deserializer, Serialize}; use serde_json::Value as JsonValue; -use time::error::Format; use time::format_description::well_known::{Iso8601, Rfc2822, Rfc3339}; -use time::format_description::FormatItem; -use time::parsing::Parsed; -use time::{Month, OffsetDateTime, PrimitiveDateTime}; -use time_fmt::parse::time_format_item::parse_to_format_item; - -use crate::TantivyDateTime; - -/// A date time parser that holds the format specification `Vec`. -#[self_referencing] -pub struct StrptimeParser { - strptime_format: String, - with_timezone: bool, - #[borrows(strptime_format)] - #[covariant] - items: Vec>, -} - -impl FromStr for StrptimeParser { - type Err = String; - - fn from_str(strptime_format: &str) -> Result { - StrptimeParser::try_new( - strptime_format.to_string(), - strptime_format.to_lowercase().contains("%z"), - |strptime_format: &String| { - parse_to_format_item(strptime_format).map_err(|error| { - format!("invalid strptime format `{strptime_format}`: {error}") - }) - }, - ) - } -} - -impl StrptimeParser { - /// Parse a given date according to the datetime format specified during the StrptimeParser - /// creation. If the date format does not provide a specific a time, the time will be set to - /// 00:00:00. - fn parse_primitive_date_time(&self, date_time_str: &str) -> anyhow::Result { - let mut parsed = Parsed::new(); - if !parsed - .parse_items(date_time_str.as_bytes(), self.borrow_items())? - .is_empty() - { - anyhow::bail!( - "datetime string `{}` does not match strptime format `{}`", - date_time_str, - self.borrow_strptime_format() - ); - } - // The parsed datetime contains a date but seems to be missing "time". - // We complete it artificially with 00:00:00. - if parsed.hour_24().is_none() - && !(parsed.hour_12().is_some() && parsed.hour_12_is_pm().is_some()) - { - parsed.set_hour_24(0u8); - parsed.set_minute(0u8); - parsed.set_second(0u8); - } - if parsed.year().is_none() { - let now = OffsetDateTime::now_utc(); - let year = infer_year(parsed.month(), now.month(), now.year()); - parsed.set_year(year); - } - let date_time = parsed.try_into()?; - Ok(date_time) - } - - pub fn parse_date_time(&self, date_time_str: &str) -> Result { - if *self.borrow_with_timezone() { - OffsetDateTime::parse(date_time_str, self.borrow_items()).map_err(|err| err.to_string()) - } else { - self.parse_primitive_date_time(date_time_str) - .map(|date_time| date_time.assume_utc()) - .map_err(|err| err.to_string()) - } - } - - pub fn format_date_time(&self, date_time: &OffsetDateTime) -> Result { - date_time.format(self.borrow_items()) - } -} +use time::Month; -impl Clone for StrptimeParser { - fn clone(&self) -> Self { - // `self.format` is already known to be a valid format. - Self::from_str(self.borrow_strptime_format().as_str()).unwrap() - } -} - -impl PartialEq for StrptimeParser { - fn eq(&self, other: &Self) -> bool { - self.borrow_strptime_format() == other.borrow_strptime_format() - } -} - -impl Eq for StrptimeParser {} - -impl std::fmt::Debug for StrptimeParser { - fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter - .debug_struct("StrptimeParser") - .field("format", &self.borrow_strptime_format()) - .finish() - } -} - -impl std::hash::Hash for StrptimeParser { - fn hash(&self, state: &mut H) { - self.borrow_strptime_format().hash(state); - } -} - -// `Strftime` format special characters. -// These characters are taken from the parsing crate we use for compatibility. -const STRFTIME_FORMAT_MARKERS: [&str; 36] = [ - "%a", "%A", "%b", "%B", "%c", "%C", "%d", "%D", "%e", "%f", "%F", "%h", "%H", "%I", "%j", "%k", - "%l", "%m", "%M", "%n", "%p", "%P", "%r", "%R", "%S", "%t", "%T", "%U", "%w", "%W", "%x", "%X", - "%y", "%Y", "%z", "%Z", -]; - -// Checks if a format contains `strftime` special characters. -fn is_strftime_formatting(format_str: &str) -> bool { - STRFTIME_FORMAT_MARKERS - .iter() - .any(|marker| format_str.contains(marker)) -} +use crate::java_date_time_format::is_strftime_formatting; +use crate::{StrptimeParser, TantivyDateTime}; /// Specifies the datetime and unix timestamp formats to use when parsing date strings. #[derive(Clone, Debug, Eq, PartialEq, Hash, Default)] @@ -170,7 +46,7 @@ impl DateTimeInputFormat { DateTimeInputFormat::Iso8601 => "iso8601", DateTimeInputFormat::Rfc2822 => "rfc2822", DateTimeInputFormat::Rfc3339 => "rfc3339", - DateTimeInputFormat::Strptime(parser) => parser.borrow_strptime_format(), + DateTimeInputFormat::Strptime(parser) => parser.strptime_format.as_str(), DateTimeInputFormat::Timestamp => "unix_timestamp", } } @@ -198,7 +74,7 @@ impl FromStr for DateTimeInputFormat { format must contain at least one `strftime` special characters" )); } - DateTimeInputFormat::Strptime(StrptimeParser::from_str(date_time_format_str)?) + DateTimeInputFormat::Strptime(StrptimeParser::from_strptime(date_time_format_str)?) } }; Ok(date_time_format) @@ -241,7 +117,7 @@ impl DateTimeOutputFormat { DateTimeOutputFormat::Iso8601 => "iso8601", DateTimeOutputFormat::Rfc2822 => "rfc2822", DateTimeOutputFormat::Rfc3339 => "rfc3339", - DateTimeOutputFormat::Strptime(parser) => parser.borrow_strptime_format(), + DateTimeOutputFormat::Strptime(parser) => parser.strptime_format.as_str(), DateTimeOutputFormat::TimestampSecs => "unix_timestamp_secs", DateTimeOutputFormat::TimestampMillis => "unix_timestamp_millis", DateTimeOutputFormat::TimestampMicros => "unix_timestamp_micros", @@ -300,7 +176,7 @@ impl FromStr for DateTimeOutputFormat { format must contain at least one `strftime` special characters" )); } - DateTimeOutputFormat::Strptime(StrptimeParser::from_str(date_time_format_str)?) + DateTimeOutputFormat::Strptime(StrptimeParser::from_strptime(date_time_format_str)?) } }; Ok(date_time_format) @@ -341,7 +217,6 @@ pub(super) fn infer_year( #[cfg(test)] mod tests { - use time::macros::datetime; use time::Month; use super::*; @@ -462,20 +337,6 @@ mod tests { } } - #[test] - fn test_strictly_parse_datetime_format() { - let parser = StrptimeParser::from_str("%Y-%m-%d").unwrap(); - assert_eq!( - parser.parse_date_time("2021-01-01").unwrap(), - datetime!(2021-01-01 00:00:00 UTC) - ); - let error = parser.parse_date_time("2021-01-01TABC").unwrap_err(); - assert_eq!( - error, - "datetime string `2021-01-01TABC` does not match strptime format `%Y-%m-%d`" - ); - } - #[test] fn test_infer_year() { let inferred_year = infer_year(None, Month::January, 2024); diff --git a/quickwit/quickwit-datetime/src/date_time_parsing.rs b/quickwit/quickwit-datetime/src/date_time_parsing.rs index 14c1fa9be90..54e8d4b88bb 100644 --- a/quickwit/quickwit-datetime/src/date_time_parsing.rs +++ b/quickwit/quickwit-datetime/src/date_time_parsing.rs @@ -179,8 +179,6 @@ pub fn parse_timestamp(timestamp: i64) -> Result { #[cfg(test)] mod tests { - use std::str::FromStr; - use time::macros::datetime; use time::Month; @@ -262,7 +260,7 @@ mod tests { ), ]; for (fmt, date_time_str, expected) in test_data { - let parser = StrptimeParser::from_str(fmt).unwrap(); + let parser = StrptimeParser::from_strptime(fmt).unwrap(); let result = parser.parse_date_time(date_time_str); if let Err(error) = &result { panic!( @@ -276,14 +274,14 @@ mod tests { #[test] fn test_parse_date_without_time() { - let strptime_parser = StrptimeParser::from_str("%Y-%m-%d").unwrap(); + let strptime_parser = StrptimeParser::from_strptime("%Y-%m-%d").unwrap(); let date = strptime_parser.parse_date_time("2012-05-21").unwrap(); assert_eq!(date, datetime!(2012-05-21 00:00:00 UTC)); } #[test] fn test_parse_date_am_pm_hour_not_zeroed() { - let strptime_parser = StrptimeParser::from_str("%Y-%m-%d %I:%M:%S %p").unwrap(); + let strptime_parser = StrptimeParser::from_strptime("%Y-%m-%d %I:%M:%S %p").unwrap(); let date = strptime_parser .parse_date_time("2012-05-21 10:05:12 pm") .unwrap(); @@ -309,13 +307,13 @@ mod tests { DateTimeInputFormat::Rfc2822, DateTimeInputFormat::Rfc3339, DateTimeInputFormat::Strptime( - StrptimeParser::from_str("%Y-%m-%d %H:%M:%S").unwrap(), + StrptimeParser::from_strptime("%Y-%m-%d %H:%M:%S").unwrap(), ), DateTimeInputFormat::Strptime( - StrptimeParser::from_str("%Y/%m/%d %H:%M:%S").unwrap(), + StrptimeParser::from_strptime("%Y/%m/%d %H:%M:%S").unwrap(), ), DateTimeInputFormat::Strptime( - StrptimeParser::from_str("%Y/%m/%d %H:%M:%S %z").unwrap(), + StrptimeParser::from_strptime("%Y/%m/%d %H:%M:%S %z").unwrap(), ), DateTimeInputFormat::Timestamp, ], @@ -452,7 +450,7 @@ mod tests { DateTimeInputFormat::Iso8601, DateTimeInputFormat::Rfc3339, DateTimeInputFormat::Strptime( - StrptimeParser::from_str("%Y-%m-%d %H:%M:%S.%f").unwrap(), + StrptimeParser::from_strptime("%Y-%m-%d %H:%M:%S.%f").unwrap(), ), ], ) diff --git a/quickwit/quickwit-datetime/src/java_date_time_format.rs b/quickwit/quickwit-datetime/src/java_date_time_format.rs new file mode 100644 index 00000000000..1cc035c90f3 --- /dev/null +++ b/quickwit/quickwit-datetime/src/java_date_time_format.rs @@ -0,0 +1,817 @@ +// Copyright (C) 2024 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +use std::collections::HashMap; +use std::num::NonZeroU8; +use std::sync::OnceLock; + +use time::error::{Format, TryFromParsed}; +use time::format_description::modifier::{ + Day, Hour, Minute, Month as MonthModifier, Padding, Second, Subsecond, SubsecondDigits, + WeekNumber, WeekNumberRepr, Weekday, WeekdayRepr, Year, YearRepr, +}; +use time::format_description::{Component, OwnedFormatItem}; +use time::parsing::Parsed; +use time::{Month, OffsetDateTime, PrimitiveDateTime, UtcOffset}; +use time_fmt::parse::time_format_item::parse_to_format_item; + +use crate::date_time_format; + +const JAVA_DATE_FORMAT_TOKENS: &[&str] = &[ + "yyyy", + "xxxx", + "xx[xx]", + "SSSSSSSSS", // For nanoseconds + "SSSSSSS", // For microseconds + "SSSSSS", // For fractional seconds up to six digits + "SSSSS", + "SSSS", + "SSS", + "SS", + "ZZ", + "xx", + "ww", + "w[w]", + "yy", + "MM", + "dd", + "HH", + "hh", + "kk", + "mm", + "ss", + "aa", + "a", + "w", + "M", + "d", + "H", + "h", + "k", + "m", + "s", + "S", + "Z", + "e", +]; + +fn literal(s: &[u8]) -> OwnedFormatItem { + // builds a boxed slice from a slice + let boxed_slice: Box<[u8]> = s.to_vec().into_boxed_slice(); + OwnedFormatItem::Literal(boxed_slice) +} + +#[inline] +fn get_padding(ptn: &str) -> Padding { + if ptn.len() == 2 { + Padding::Zero + } else { + Padding::None + } +} + +fn build_zone_offset(_: &str) -> Option { + // 'Z' literal to represent UTC offset + let z_literal = OwnedFormatItem::Literal(Box::from(b"Z".as_ref())); + + // Offset in '+/-HH:MM' format + let offset_with_delimiter_items: Box<[OwnedFormatItem]> = vec![ + OwnedFormatItem::Component(Component::OffsetHour(Default::default())), + OwnedFormatItem::Literal(Box::from(b":".as_ref())), + OwnedFormatItem::Component(Component::OffsetMinute(Default::default())), + ] + .into_boxed_slice(); + let offset_with_delimiter_compound = OwnedFormatItem::Compound(offset_with_delimiter_items); + + // Offset in '+/-HHMM' format + let offset_items: Box<[OwnedFormatItem]> = vec![ + OwnedFormatItem::Component(Component::OffsetHour(Default::default())), + OwnedFormatItem::Component(Component::OffsetMinute(Default::default())), + ] + .into_boxed_slice(); + let offset_compound = OwnedFormatItem::Compound(offset_items); + + Some(OwnedFormatItem::First( + vec![z_literal, offset_with_delimiter_compound, offset_compound].into_boxed_slice(), + )) +} + +fn build_year_item(ptn: &str) -> Option { + let mut full_year = Year::default(); + full_year.repr = YearRepr::Full; + let full_year_component = OwnedFormatItem::Component(Component::Year(full_year)); + + let mut short_year = Year::default(); + short_year.repr = YearRepr::LastTwo; + let short_year_component = OwnedFormatItem::Component(Component::Year(short_year)); + + if ptn.len() == 4 { + Some(full_year_component) + } else if ptn.len() == 2 { + Some(short_year_component) + } else { + Some(OwnedFormatItem::First( + vec![full_year_component, short_year_component].into_boxed_slice(), + )) + } +} + +fn build_week_based_year_item(ptn: &str) -> Option { + // TODO no `Component` for that + build_year_item(ptn) +} + +fn build_month_item(ptn: &str) -> Option { + let mut month: MonthModifier = Default::default(); + month.padding = get_padding(ptn); + Some(OwnedFormatItem::Component(Component::Month(month))) +} + +fn build_day_item(ptn: &str) -> Option { + let mut day = Day::default(); + day.padding = get_padding(ptn); + Some(OwnedFormatItem::Component(Component::Day(day))) +} + +fn build_day_of_week_item(_: &str) -> Option { + let mut weekday = Weekday::default(); + weekday.repr = WeekdayRepr::Monday; + weekday.one_indexed = false; + Some(OwnedFormatItem::Component(Component::Weekday(weekday))) +} + +fn build_week_of_year_item(ptn: &str) -> Option { + let mut week_number = WeekNumber::default(); + week_number.repr = WeekNumberRepr::Monday; + week_number.padding = get_padding(ptn); + Some(OwnedFormatItem::Component(Component::WeekNumber( + week_number, + ))) +} + +fn build_hour_item(ptn: &str) -> Option { + let mut hour = Hour::default(); + hour.padding = get_padding(ptn); + hour.is_12_hour_clock = false; + Some(OwnedFormatItem::Component(Component::Hour(hour))) +} + +fn build_minute_item(ptn: &str) -> Option { + let mut minute: Minute = Default::default(); + minute.padding = get_padding(ptn); + Some(OwnedFormatItem::Component(Component::Minute(minute))) +} + +fn build_second_item(ptn: &str) -> Option { + let mut second: Second = Default::default(); + second.padding = get_padding(ptn); + Some(OwnedFormatItem::Component(Component::Second(second))) +} + +fn build_fraction_of_second_item(_ptn: &str) -> Option { + let mut subsecond: Subsecond = Default::default(); + subsecond.digits = SubsecondDigits::OneOrMore; + Some(OwnedFormatItem::Component(Component::Subsecond(subsecond))) +} + +fn parse_java_datetime_format_items_recursive( + chars: &mut std::iter::Peekable, +) -> Result, String> { + let mut items = Vec::new(); + + while let Some(&c) = chars.peek() { + match c { + '[' => { + chars.next(); + let optional_items = parse_java_datetime_format_items_recursive(chars)?; + items.push(OwnedFormatItem::Optional(Box::new( + OwnedFormatItem::Compound(optional_items.into_boxed_slice()), + ))); + } + ']' => { + chars.next(); + break; + } + '\'' => { + chars.next(); + let mut literal_str = String::new(); + while let Some(&next_c) = chars.peek() { + if next_c == '\'' { + chars.next(); + break; + } else { + literal_str.push(next_c); + chars.next(); + } + } + items.push(literal(literal_str.as_bytes())); + } + _ => { + if let Some(format_item) = match_java_date_format_token(chars)? { + items.push(format_item); + } else { + // Treat as a literal character + items.push(literal(c.to_string().as_bytes())); + chars.next(); + } + } + } + } + + Ok(items) +} + +// Elasticsearch/OpenSearch uses a set of preconfigured formats, more information could be found +// here https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html +fn match_java_date_format_token( + chars: &mut std::iter::Peekable, +) -> Result, String> { + if chars.peek().is_none() { + return Ok(None); + } + + let remaining: String = chars.clone().collect(); + + // Try to match the longest possible token + for token in JAVA_DATE_FORMAT_TOKENS { + if remaining.starts_with(token) { + for _ in 0..token.len() { + chars.next(); + } + + let format_item = match *token { + "yyyy" | "yy" => build_year_item(token), + "xxxx" | "xx[xx]" | "xx" => build_week_based_year_item(token), + "MM" | "M" => build_month_item(token), + "dd" | "d" => build_day_item(token), + "HH" | "H" => build_hour_item(token), + "mm" | "m" => build_minute_item(token), + "ss" | "s" => build_second_item(token), + "SSSSSSSSS" | "SSSSSSS" | "SSSSSS" | "SSSSS" | "SSSS" | "SSS" | "SS" | "S" => { + build_fraction_of_second_item(token) + } + "Z" => build_zone_offset(token), + "ww" | "w[w]" | "w" => build_week_of_year_item(token), + "e" => build_day_of_week_item(token), + _ => return Err(format!("Unrecognized token '{}'", token)), + }; + return Ok(format_item); + } + } + + Ok(None) +} + +// Check if the given date time format is a common alias and replace it with the +// Java date format it is mapped to, if any. +// If the java_datetime_format is not an alias, it is expected to be a +// java date time format and should be returned as is. +fn resolve_java_datetime_format_alias(java_datetime_format: &str) -> &str { + static JAVA_DATE_FORMAT_ALIASES: OnceLock> = + OnceLock::new(); + let java_datetime_format_map = JAVA_DATE_FORMAT_ALIASES.get_or_init(|| { + let mut m = HashMap::new(); + m.insert("date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); + m.insert( + "strict_date_optional_time", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS[Z]]]]]]]", + ); + m.insert( + "strict_date_optional_time_nanos", + "yyyy[-MM[-dd['T'HH:mm:ss.SSSSSSZ]]]", + ); + m.insert("basic_date", "yyyyMMdd"); + + m.insert("strict_basic_week_date", "xxxx'W'wwe"); + m.insert("basic_week_date", "xx[xx]'W'wwe"); + + m.insert("strict_basic_week_date_time", "xxxx'W'wwe'T'HHmmss.SSSZ"); + m.insert("basic_week_date_time", "xx[xx]'W'wwe'T'HHmmss.SSSZ"); + + m.insert( + "strict_basic_week_date_time_no_millis", + "xxxx'W'wwe'T'HHmmssZ", + ); + m.insert("basic_week_date_time_no_millis", "xx[xx]'W'wwe'T'HHmmssZ"); + + m.insert("strict_week_date", "xxxx-'W'ww-e"); + m.insert("week_date", "xxxx-'W'w[w]-e"); + m + }); + java_datetime_format_map + .get(java_datetime_format) + .copied() + .unwrap_or(java_datetime_format) +} + +/// A date time parser that holds the format specification `Vec`. +#[derive(Clone)] +pub struct StrptimeParser { + pub(crate) strptime_format: String, + items: Box<[OwnedFormatItem]>, +} + +pub fn parse_java_datetime_format_items( + java_datetime_format: &str, +) -> Result, String> { + let mut chars = java_datetime_format.chars().peekable(); + let items = parse_java_datetime_format_items_recursive(&mut chars)?; + Ok(items.into_boxed_slice()) +} + +impl StrptimeParser { + /// Parse a date assume UTC if unspecified. + /// See `parse_date_time_with_default_timezone` for more details. + pub fn parse_date_time(&self, date_time_str: &str) -> Result { + self.parse_date_time_with_default_timezone(date_time_str, UtcOffset::UTC) + } + + /// Parse a date. If no timezone is specified we will assume the timezone passed as + /// `default_offset`. If the date is missing, it will be automatically set to 00:00:00. + pub fn parse_date_time_with_default_timezone( + &self, + date_time_str: &str, + default_offset: UtcOffset, + ) -> Result { + let mut parsed = Parsed::new(); + if !parsed + .parse_items(date_time_str.as_bytes(), &self.items) + .map_err(|err| err.to_string())? + .is_empty() + { + return Err(format!( + "datetime string `{}` does not match strptime format `{}`", + date_time_str, &self.strptime_format + )); + } + + // The parsed datetime contains a date but seems to be missing "time". + // We complete it artificially with 00:00:00. + if parsed.hour_24().is_none() + && !(parsed.hour_12().is_some() && parsed.hour_12_is_pm().is_some()) + { + parsed.set_hour_24(0u8); + parsed.set_minute(0u8); + parsed.set_second(0u8); + } + + if parsed.year().is_none() { + let now = OffsetDateTime::now_utc(); + let year = date_time_format::infer_year(parsed.month(), now.month(), now.year()); + parsed.set_year(year); + } + + if parsed.day().is_none() && parsed.monday_week_number().is_none() { + parsed.set_day(NonZeroU8::try_from(1u8).unwrap()); + } + + if parsed.month().is_none() && parsed.monday_week_number().is_none() { + parsed.set_month(Month::January); + } + + if parsed.offset_hour().is_some() { + let offset_datetime: OffsetDateTime = parsed + .try_into() + .map_err(|err: TryFromParsed| err.to_string())?; + return Ok(offset_datetime); + } + let primitive_date_time: PrimitiveDateTime = parsed + .try_into() + .map_err(|err: TryFromParsed| err.to_string())?; + Ok(primitive_date_time.assume_offset(default_offset)) + } + + pub fn format_date_time(&self, date_time: &OffsetDateTime) -> Result { + date_time.format(&self.items) + } + + pub fn from_strptime(strptime_format: &str) -> Result { + let items: Box<[OwnedFormatItem]> = parse_to_format_item(strptime_format) + .map_err(|err| format!("invalid strptime format `{strptime_format}`: {err}"))? + .into_iter() + .map(|item| item.into()) + .collect::>() + .into_boxed_slice(); + Ok(StrptimeParser::new(strptime_format.to_string(), items)) + } + + pub fn from_java_datetime_format(java_datetime_format: &str) -> Result { + let java_datetime_format_resolved = + resolve_java_datetime_format_alias(java_datetime_format); + let items: Box<[OwnedFormatItem]> = + parse_java_datetime_format_items(java_datetime_format_resolved)?; + Ok(StrptimeParser::new(java_datetime_format.to_string(), items)) + } + + fn new(strptime_format: String, items: Box<[OwnedFormatItem]>) -> Self { + StrptimeParser { + strptime_format, + items, + } + } +} + +impl PartialEq for StrptimeParser { + fn eq(&self, other: &Self) -> bool { + self.strptime_format == other.strptime_format + } +} + +impl Eq for StrptimeParser {} + +impl std::fmt::Debug for StrptimeParser { + fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter + .debug_struct("StrptimeParser") + .field("format", &self.strptime_format) + .finish() + } +} + +impl std::hash::Hash for StrptimeParser { + fn hash(&self, state: &mut H) { + self.strptime_format.hash(state); + } +} + +// `Strftime` format special characters. +// These characters are taken from the parsing crate we use for compatibility. +const STRFTIME_FORMAT_MARKERS: [&str; 36] = [ + "%a", "%A", "%b", "%B", "%c", "%C", "%d", "%D", "%e", "%f", "%F", "%h", "%H", "%I", "%j", "%k", + "%l", "%m", "%M", "%n", "%p", "%P", "%r", "%R", "%S", "%t", "%T", "%U", "%w", "%W", "%x", "%X", + "%y", "%Y", "%z", "%Z", +]; + +// Checks if a format contains `strftime` special characters. +pub fn is_strftime_formatting(format_str: &str) -> bool { + STRFTIME_FORMAT_MARKERS + .iter() + .any(|marker| format_str.contains(marker)) +} + +#[cfg(test)] +mod tests { + use time::macros::datetime; + + use super::*; + use crate::java_date_time_format::parse_java_datetime_format_items; + + #[test] + fn test_parse_datetime_format_missing_time() { + let parser = StrptimeParser::from_strptime("%Y-%m-%d").unwrap(); + assert_eq!( + parser.parse_date_time("2021-01-01").unwrap(), + datetime!(2021-01-01 00:00:00 UTC) + ); + } + + #[test] + fn test_parse_datetime_format_strict_on_trailing_data() { + let parser = StrptimeParser::from_strptime("%Y-%m-%d").unwrap(); + let error = parser.parse_date_time("2021-01-01TABC").unwrap_err(); + assert_eq!( + error, + "datetime string `2021-01-01TABC` does not match strptime format `%Y-%m-%d`" + ); + } + + #[test] + fn test_parse_strptime_with_timezone() { + let parser = StrptimeParser::from_strptime("%Y-%m-%dT%H:%M:%S %z").unwrap(); + let offset_datetime = parser + .parse_date_time("2021-01-01T11:00:03 +07:00") + .unwrap(); + assert_eq!(offset_datetime, datetime!(2021-01-01 11:00:03 +7)); + } + + #[track_caller] + fn test_parse_java_datetime_aux( + java_date_time_format: &str, + date_str: &str, + expected_datetime: OffsetDateTime, + ) { + let parser = StrptimeParser::from_java_datetime_format(java_date_time_format).unwrap(); + let datetime = parser.parse_date_time(date_str).unwrap(); + assert_eq!(datetime, expected_datetime); + } + + #[test] + fn test_parse_java_datetime_format() { + test_parse_java_datetime_aux("yyyyMMdd", "20210101", datetime!(2021-01-01 00:00:00 UTC)); + test_parse_java_datetime_aux( + "yyyy MM dd", + "2021 01 01", + datetime!(2021-01-01 00:00:00 UTC), + ); + test_parse_java_datetime_aux( + "yyyy!MM?dd", + "2021!01?01", + datetime!(2021-01-01 00:00:00 UTC), + ); + test_parse_java_datetime_aux( + "yyyy!MM?dd'T'HH:", + "2021!01?01T13:", + datetime!(2021-01-01 13:00:00 UTC), + ); + test_parse_java_datetime_aux( + "yyyy!MM?dd['T'[HH:]]", + "2021!01?01", + datetime!(2021-01-01 00:00:00 UTC), + ); + test_parse_java_datetime_aux( + "yyyy!MM?dd['T'[HH:]", + "2021!01?01T", + datetime!(2021-01-01 00:00:00 UTC), + ); + test_parse_java_datetime_aux( + "yyyy!MM?dd['T'[HH:]]", + "2021!01?01T13:", + datetime!(2021-01-01 13:00:00 UTC), + ); + } + + #[test] + fn test_parse_java_missing_time() { + test_parse_java_datetime_aux( + "yyyy-MM-dd", + "2021-01-01", + datetime!(2021-01-01 00:00:00 UTC), + ); + } + + #[test] + fn test_parse_java_optional_missing_time() { + test_parse_java_datetime_aux( + "yyyy-MM-dd[ HH:mm:ss]", + "2021-01-01", + datetime!(2021-01-01 00:00:00 UTC), + ); + test_parse_java_datetime_aux( + "yyyy-MM-dd[ HH:mm:ss]", + "2021-01-01 12:34:56", + datetime!(2021-01-01 12:34:56 UTC), + ); + } + + #[test] + fn test_parse_java_datetime_format_aliases() { + test_parse_java_datetime_aux( + "date_optional_time", + "2021-01-01", + datetime!(2021-01-01 00:00:00 UTC), + ); + test_parse_java_datetime_aux( + "date_optional_time", + "2021-01-21T03:01:22.312+01:00", + datetime!(2021-01-21 03:01:22.312 +1), + ); + } + + #[test] + fn test_parse_java_week_formats() { + test_parse_java_datetime_aux( + "basic_week_date", + "2024W313", + datetime!(2024-08-01 0:00:00.0 +00:00:00), + ); + test_parse_java_datetime_aux( + "basic_week_date", + "24W313", + datetime!(2024-08-01 0:00:00.0 +00:00:00), + ); + // // ❌ 'the 'year' component could not be parsed' + // test_parse_java_datetime_aux( + // "basic_week_date", + // "1W313", + // datetime!(2018-08-02 0:00:00.0 +00:00:00), + // ); + test_parse_java_datetime_aux( + "basic_week_date_time", + "2018W313T121212.1Z", + datetime!(2018-08-02 12:12:12.1 +00:00:00), + ); + test_parse_java_datetime_aux( + "basic_week_date_time", + "2018W313T121212.123Z", + datetime!(2018-08-02 12:12:12.123 +00:00:00), + ); + test_parse_java_datetime_aux( + "basic_week_date_time", + "2018W313T121212.123456789Z", + datetime!(2018-08-02 12:12:12.123456789 +00:00:00), + ); + test_parse_java_datetime_aux( + "basic_week_date_time", + "2018W313T121212.123+0100", + datetime!(2018-08-02 12:12:12.123 +01:00:00), + ); + test_parse_java_datetime_aux( + "basic_week_date_time_no_millis", + "2018W313T121212Z", + datetime!(2018-08-02 12:12:12.0 +00:00:00), + ); + test_parse_java_datetime_aux( + "basic_week_date_time_no_millis", + "2018W313T121212+0100", + datetime!(2018-08-02 12:12:12.0 +01:00:00), + ); + test_parse_java_datetime_aux( + "basic_week_date_time_no_millis", + "2018W313T121212+01:00", + datetime!(2018-08-02 12:12:12.0 +01:00:00), + ); + + test_parse_java_datetime_aux( + "week_date", + "2012-W48-6", + datetime!(2012-12-02 0:00:00.0 +00:00:00), + ); + + test_parse_java_datetime_aux( + "week_date", + "2012-W01-6", + datetime!(2012-01-08 0:00:00.0 +00:00:00), + ); + + test_parse_java_datetime_aux( + "week_date", + "2012-W1-6", + datetime!(2012-01-08 0:00:00.0 +00:00:00), + ); + } + + #[test] + fn test_parse_java_strict_week_formats() { + test_parse_java_datetime_aux( + "strict_basic_week_date", + "2024W313", + datetime!(2024-08-01 0:00:00.0 +00:00:00), + ); + + test_parse_java_datetime_aux( + "strict_week_date", + "2012-W48-6", + datetime!(2012-12-02 0:00:00.0 +00:00:00), + ); + + test_parse_java_datetime_aux( + "strict_week_date", + "2012-W01-6", + datetime!(2012-01-08 0:00:00.0 +00:00:00), + ); + } + + #[test] + fn test_parse_strict_date_optional_time() { + let parser = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + let dates = [ + "2019", + "2019-03", + "2019-03-23", + "2019-03-23T21:34", + "2019-03-23T21:34:46", + "2019-03-23T21:34:46.123Z", + "2019-03-23T21:35:46.123+00:00", + "2019-03-23T21:36:46.123+03:00", + "2019-03-23T21:37:46.123+0300", + ]; + let expected = [ + datetime!(2019-01-01 00:00:00 UTC), + datetime!(2019-03-01 00:00:00 UTC), + datetime!(2019-03-23 00:00:00 UTC), + datetime!(2019-03-23 21:34 UTC), + datetime!(2019-03-23 21:34:46 UTC), + datetime!(2019-03-23 21:34:46.123 UTC), + datetime!(2019-03-23 21:35:46.123 UTC), + datetime!(2019-03-23 21:36:46.123 +03:00:00), + datetime!(2019-03-23 21:37:46.123 +03:00:00), + ]; + for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { + let parsed_dt = parser + .parse_date_time(date_str) + .unwrap_or_else(|e| panic!("Failed to parse {}: {}", date_str, e)); + assert_eq!(parsed_dt, expected_dt); + } + } + + #[test] + fn test_parse_strict_date_optional_time_nanos() { + let parser = + StrptimeParser::from_java_datetime_format("strict_date_optional_time_nanos").unwrap(); + let dates = [ + "2019", + "2019-03", + "2019-03-23", + "2019-03-23T21:34:46.123456789Z", + "2019-03-23T21:35:46.123456789+00:00", + "2019-03-23T21:36:46.123456789+03:00", + "2019-03-23T21:37:46.123456789+0300", + ]; + let expected = [ + datetime!(2019-01-01 00:00:00 UTC), + datetime!(2019-03-01 00:00:00 UTC), + datetime!(2019-03-23 00:00:00 UTC), + datetime!(2019-03-23 21:34:46.123456789 UTC), + datetime!(2019-03-23 21:35:46.123456789 UTC), + datetime!(2019-03-23 21:36:46.123456789 +03:00:00), + datetime!(2019-03-23 21:37:46.123456789 +03:00:00), + ]; + for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { + let parsed_dt = parser + .parse_date_time(date_str) + .unwrap_or_else(|e| panic!("Failed to parse {}: {}", date_str, e)); + assert_eq!(parsed_dt, expected_dt); + } + } + + #[test] + fn test_parse_java_datetime_format_items() { + let format_str = "xx[xx]'W'wwe"; + let result = parse_java_datetime_format_items(format_str).unwrap(); + + // We expect the tokens to be parsed as: + // - 'xx[xx]' (week-based year) with optional length + // - 'W' (literal) + // - 'ww' (week of year) + // - 'e' (day of week) + + assert_eq!(result.len(), 4); + + // Verify each token + match &result[0] { + OwnedFormatItem::First(boxed_slice) => { + assert_eq!(boxed_slice.len(), 2); + match (&boxed_slice[0], &boxed_slice[1]) { + ( + OwnedFormatItem::Component(Component::Year(_)), + OwnedFormatItem::Component(Component::Year(_)), + ) => {} + unexpected => { + panic!("Expected two Year components, but found: {:?}", unexpected) + } + } + } + unexpected => panic!( + "Expected First with two Year components, but found: {:?}", + unexpected + ), + } + + match &result[1] { + OwnedFormatItem::Literal(lit) => assert_eq!(lit.as_ref(), b"W"), + unexpected => panic!("Expected literal 'W', but found: {:?}", unexpected), + } + + match &result[2] { + OwnedFormatItem::Component(Component::WeekNumber(_)) => {} + unexpected => panic!("Expected WeekNumber component, but found: {:?}", unexpected), + } + + match &result[3] { + OwnedFormatItem::Component(Component::Weekday(_)) => {} + unexpected => panic!("Expected Weekday component, but found: {:?}", unexpected), + } + } + + #[test] + fn test_parse_java_datetime_format_with_literals() { + let format = "yyyy'T'Z-HHuu"; + let parser = StrptimeParser::from_java_datetime_format(format).unwrap(); + + let test_cases = [ + ("2023TZ-14uu", datetime!(2023-01-01 14:00:00 UTC)), + ("2024TZ-05uu", datetime!(2024-01-01 05:00:00 UTC)), + ("2025TZ-23uu", datetime!(2025-01-01 23:00:00 UTC)), + ]; + + for (input, expected) in test_cases.iter() { + let result = parser.parse_date_time(input).unwrap(); + assert_eq!(result, *expected, "Failed to parse {}", input); + } + + // Test error case + let error_case = "2023-1430"; + assert!( + parser.parse_date_time(error_case).is_err(), + "Expected error for input: {}", + error_case + ); + } +} diff --git a/quickwit/quickwit-datetime/src/lib.rs b/quickwit/quickwit-datetime/src/lib.rs index eb4d8c940ba..03003641dcc 100644 --- a/quickwit/quickwit-datetime/src/lib.rs +++ b/quickwit/quickwit-datetime/src/lib.rs @@ -19,9 +19,11 @@ mod date_time_format; mod date_time_parsing; +pub mod java_date_time_format; -pub use date_time_format::{DateTimeInputFormat, DateTimeOutputFormat, StrptimeParser}; +pub use date_time_format::{DateTimeInputFormat, DateTimeOutputFormat}; pub use date_time_parsing::{ parse_date_time_str, parse_timestamp, parse_timestamp_float, parse_timestamp_int, }; +pub use java_date_time_format::StrptimeParser; pub use tantivy::DateTime as TantivyDateTime; diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index e1229da8e8b..bee650198c8 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -22,6 +22,7 @@ serde = { workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } tantivy = { workspace = true } +time = { workspace = true } thiserror = { workspace = true } whichlang = { workspace = true, optional = true } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/range_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/range_query.rs index 9e7d07e23da..337ec019e9d 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/range_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/range_query.rs @@ -18,10 +18,10 @@ // along with this program. If not, see . use std::ops::Bound; -use std::str::FromStr; use quickwit_datetime::StrptimeParser; use serde::Deserialize; +use time::format_description::well_known::Rfc3339; use crate::elastic_query_dsl::one_field_map::OneFieldMap; use crate::elastic_query_dsl::ConvertibleToQueryAst; @@ -59,10 +59,9 @@ impl ConvertibleToQueryAst for RangeQuery { boost, format, } = self.value; - let (gt, gte, lt, lte) = if let Some(JsonLiteral::String(fmt)) = format { - let parser = StrptimeParser::from_str(&fmt).map_err(|reason| { - anyhow::anyhow!("failed to create parser from : {}; reason: {}", fmt, reason) - })?; + let (gt, gte, lt, lte) = if let Some(JsonLiteral::String(java_date_format)) = format { + let parser = StrptimeParser::from_java_datetime_format(&java_date_format) + .map_err(|err| anyhow::anyhow!("failed to parse range query date format. {err}"))?; ( gt.map(|v| parse_and_convert(v, &parser)).transpose()?, gte.map(|v| parse_and_convert(v, &parser)).transpose()?, @@ -102,7 +101,8 @@ fn parse_and_convert(literal: JsonLiteral, parser: &StrptimeParser) -> anyhow::R let parsed_date_time = parser .parse_date_time(&date_time_str) .map_err(|reason| anyhow::anyhow!("Failed to parse date time: {}", reason))?; - Ok(JsonLiteral::String(parsed_date_time.to_string())) + let parsed_date_time_rfc3339 = parsed_date_time.format(&Rfc3339)?; + Ok(JsonLiteral::String(parsed_date_time_rfc3339)) } else { Ok(literal) } @@ -110,39 +110,62 @@ fn parse_and_convert(literal: JsonLiteral, parser: &StrptimeParser) -> anyhow::R #[cfg(test)] mod tests { - use std::str::FromStr; + use std::ops::Bound; - use quickwit_datetime::StrptimeParser; - - use crate::elastic_query_dsl::range_query::parse_and_convert; + use super::{RangeQuery as ElasticRangeQuery, RangeQueryParams as ElasticRangeQueryParams}; + use crate::elastic_query_dsl::ConvertibleToQueryAst; + use crate::query_ast::{QueryAst, RangeQuery}; use crate::JsonLiteral; #[test] - fn test_parse_and_convert() -> anyhow::Result<()> { - let parser = StrptimeParser::from_str("%Y-%m-%d %H:%M:%S").unwrap(); - - // valid datetime - let input = JsonLiteral::String("2022-12-30 05:45:00".to_string()); - let result = parse_and_convert(input, &parser)?; - assert_eq!( - result, - JsonLiteral::String("2022-12-30 5:45:00.0 +00:00:00".to_string()) - ); - - // invalid datetime - let input = JsonLiteral::String("invalid datetime".to_string()); - let result = parse_and_convert(input, &parser); - assert!(result.is_err()); - assert!(result - .unwrap_err() - .to_string() - .contains("Failed to parse date time")); - - // non_string(number) input - let input = JsonLiteral::Number(27.into()); - let result = parse_and_convert(input.clone(), &parser)?; - assert_eq!(result, input); + fn test_date_range_query_with_format() { + let range_query_params = ElasticRangeQueryParams { + gt: Some(JsonLiteral::String("2021-01-03T13:32:43".to_string())), + gte: None, + lt: None, + lte: None, + boost: None, + format: JsonLiteral::String("yyyy-MM-dd['T'HH:mm:ss]".to_string()).into(), + }; + let range_query: ElasticRangeQuery = ElasticRangeQuery { + field: "date".to_string(), + value: range_query_params, + }; + let range_query_ast = range_query.convert_to_query_ast().unwrap(); + assert!(matches!( + range_query_ast, + QueryAst::Range(RangeQuery { + field, + lower_bound: Bound::Excluded(lower_bound), + upper_bound: Bound::Unbounded, + }) + if field == "date" && lower_bound == JsonLiteral::String("2021-01-03T13:32:43Z".to_string()) + )); + } - Ok(()) + #[test] + fn test_date_range_query_with_strict_date_optional_time_format() { + let range_query_params = ElasticRangeQueryParams { + gt: None, + gte: None, + lt: None, + lte: Some(JsonLiteral::String("2024-09-28T10:22:55.797Z".to_string())), + boost: None, + format: JsonLiteral::String("strict_date_optional_time".to_string()).into(), + }; + let range_query: ElasticRangeQuery = ElasticRangeQuery { + field: "timestamp".to_string(), + value: range_query_params, + }; + let range_query_ast = range_query.convert_to_query_ast().unwrap(); + assert!(matches!( + range_query_ast, + QueryAst::Range(RangeQuery { + field, + lower_bound: Bound::Unbounded, + upper_bound: Bound::Included(upper_bound), + }) + if field == "timestamp" && upper_bound == JsonLiteral::String("2024-09-28T10:22:55.797Z".to_string()) + )); } } diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0007-range_queries.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0007-range_queries.yaml index 5337325c229..bbedea70e0d 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0007-range_queries.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0007-range_queries.yaml @@ -243,5 +243,18 @@ expected: total: value: 68 relation: "eq" - - +--- +# Timestamp field with a custom format. +json: + query: + range: + created_at: + gte: "2015|02|01 T00:00:00.001999Z" + lte: "2015|02|01 T00:00:00.001999Z" + # Elasticsearch date format requires text to be escaped with single quotes + format: yyyy|MM|dd 'T'HH:mm:ss.SSSSSS'Z' +expected: + hits: + total: + value: 1 + relation: "eq"