diff --git a/CHANGELOG.md b/CHANGELOG.md index b0c9a47bba..f03bf345ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +Tantivy 0.19 +================================ +- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396) + The `DateTime` type has been updated to hold timestamps with microseconds precision. + `DateOptions` and `DatePrecision` have been added to configure Date fields. The precision is used to hint on fast values compression. Otherwise, seconds precision is used everywhere else (i.e terms, indexing). + + Tantivy 0.18 ================================ - For date values `chrono` has been replaced with `time` (@uklotzde) #1304 : diff --git a/Cargo.toml b/Cargo.toml index 4cb2b6db86..6ecc4a5d4b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,7 @@ thiserror = "1.0.30" htmlescape = "0.3.1" fail = "0.5.0" murmurhash32 = "0.2.0" -time = { version = "0.3.9", features = ["serde-well-known"] } +time = { version = "0.3.10", features = ["serde-well-known"] } smallvec = "1.8.0" rayon = "1.5.2" lru = "0.7.5" diff --git a/examples/date_time_field.rs b/examples/date_time_field.rs new file mode 100644 index 0000000000..4381ed34cb --- /dev/null +++ b/examples/date_time_field.rs @@ -0,0 +1,69 @@ +// # DateTime field example +// +// This example shows how the DateTime field can be used + +use tantivy::collector::TopDocs; +use tantivy::query::QueryParser; +use tantivy::schema::{Cardinality, DateOptions, Schema, Value, INDEXED, STORED, STRING}; +use tantivy::Index; + +fn main() -> tantivy::Result<()> { + // # Defining the schema + let mut schema_builder = Schema::builder(); + let opts = DateOptions::from(INDEXED) + .set_stored() + .set_fast(Cardinality::SingleValue) + .set_precision(tantivy::DatePrecision::Seconds); + let occurred_at = schema_builder.add_date_field("occurred_at", opts); + let event_type = schema_builder.add_text_field("event", STRING | STORED); + let schema = schema_builder.build(); + + // # Indexing 
documents + let index = Index::create_in_ram(schema.clone()); + + let mut index_writer = index.writer(50_000_000)?; + let doc = schema.parse_document( + r#"{ + "occurred_at": "2022-06-22T12:53:50.53Z", + "event": "pull-request" + }"#, + )?; + index_writer.add_document(doc)?; + let doc = schema.parse_document( + r#"{ + "occurred_at": "2022-06-22T13:00:00.22Z", + "event": "comment" + }"#, + )?; + index_writer.add_document(doc)?; + index_writer.commit()?; + + let reader = index.reader()?; + let searcher = reader.searcher(); + + // # Default fields: event_type + let query_parser = QueryParser::for_index(&index, vec![event_type]); + { + let query = query_parser.parse_query("event:comment")?; + let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?; + assert_eq!(count_docs.len(), 1); + } + { + let query = query_parser + .parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?; + let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?; + assert_eq!(count_docs.len(), 1); + for (_score, doc_address) in count_docs { + let retrieved_doc = searcher.doc(doc_address)?; + assert!(matches!( + retrieved_doc.get_first(occurred_at), + Some(Value::Date(_)) + )); + assert_eq!( + schema.to_json(&retrieved_doc), + r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"# + ); + } + } + Ok(()) +} diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index e09f733035..5509a78a7a 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -14,7 +14,7 @@ pub struct BitpackedFastFieldReader { pub max_value_u64: u64, } -impl<'data> FastFieldCodecReader for BitpackedFastFieldReader { +impl FastFieldCodecReader for BitpackedFastFieldReader { /// Opens a fast field given a file. 
fn open_from_bytes(bytes: &[u8]) -> io::Result { let (_data, mut footer) = bytes.split_at(bytes.len() - 16); diff --git a/query-grammar/src/query_grammar.rs b/query-grammar/src/query_grammar.rs index 4f05fbbea5..a26354653e 100644 --- a/query-grammar/src/query_grammar.rs +++ b/query-grammar/src/query_grammar.rs @@ -575,7 +575,7 @@ mod test { for special_char in SPECIAL_CHARS.iter() { let query = &format!("\\{special_char}my\\{special_char}field:a"); assert_eq!( - super::field_name().parse(&query), + super::field_name().parse(query), Ok((format!("{special_char}my{special_char}field"), "a")) ); } diff --git a/src/aggregation/intermediate_agg_result.rs b/src/aggregation/intermediate_agg_result.rs index b6b38bfde0..8e03ad15ac 100644 --- a/src/aggregation/intermediate_agg_result.rs +++ b/src/aggregation/intermediate_agg_result.rs @@ -36,7 +36,10 @@ pub struct IntermediateAggregationResults { impl IntermediateAggregationResults { /// Convert intermediate result and its aggregation request to the final result. - pub fn into_final_bucket_result(self, req: Aggregations) -> crate::Result { + pub(crate) fn into_final_bucket_result( + self, + req: Aggregations, + ) -> crate::Result { self.into_final_bucket_result_internal(&(req.into())) } diff --git a/src/collector/histogram_collector.rs b/src/collector/histogram_collector.rs index 22956a86a2..c4dfba59a0 100644 --- a/src/collector/histogram_collector.rs +++ b/src/collector/histogram_collector.rs @@ -72,8 +72,7 @@ impl HistogramComputer { return; } let delta = value - self.min_value; - let delta_u64 = delta.to_u64(); - let bucket_id: usize = self.divider.divide(delta_u64) as usize; + let bucket_id: usize = self.divider.divide(delta) as usize; if bucket_id < self.counts.len() { self.counts[bucket_id] += 1; } @@ -287,7 +286,7 @@ mod tests { DateTime::from_primitive( Date::from_calendar_date(1980, Month::January, 1)?.with_hms(0, 0, 0)?, ), - 3600 * 24 * 365, // it is just for a unit test... sorry leap years. 
+ 3_600_000_000 * 24 * 365, // it is just for a unit test... sorry leap years. 10, ); let week_histogram = searcher.search(&all_query, &week_histogram_collector)?; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index c691897476..004a5328e6 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -52,11 +52,13 @@ pub trait MultiValueLength { fn get_total_len(&self) -> u64; } -/// Trait for types that are allowed for fast fields: (u64, i64 and f64). +/// Trait for types that are allowed for fast fields: +/// (u64, i64 and f64, bool, DateTime). pub trait FastValue: Clone + Copy + Send + Sync + PartialOrd + 'static { /// Converts a value from u64 /// /// Internally all fast field values are encoded as u64. + /// **Note: To be used for converting encoded Term, Posting values.** fn from_u64(val: u64) -> Self; /// Converts a value to u64. @@ -189,24 +191,27 @@ impl FastValue for bool { } impl FastValue for DateTime { - fn from_u64(timestamp_u64: u64) -> Self { - let unix_timestamp = i64::from_u64(timestamp_u64); - Self::from_unix_timestamp(unix_timestamp) + /// Converts a timestamp microseconds into DateTime. 
+ /// + /// **Note the timestamps is expected to be in microseconds.** + fn from_u64(timestamp_micros_u64: u64) -> Self { + let timestamp_micros = i64::from_u64(timestamp_micros_u64); + Self::from_timestamp_micros(timestamp_micros) } fn to_u64(&self) -> u64 { - self.into_unix_timestamp().to_u64() + common::i64_to_u64(self.into_timestamp_micros()) } fn fast_field_cardinality(field_type: &FieldType) -> Option { match *field_type { - FieldType::Date(ref integer_options) => integer_options.get_fastfield_cardinality(), + FieldType::Date(ref options) => options.get_fastfield_cardinality(), _ => None, } } fn as_u64(&self) -> u64 { - self.into_unix_timestamp().as_u64() + self.into_timestamp_micros().as_u64() } fn to_type() -> Type { @@ -261,9 +266,9 @@ mod tests { use super::*; use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr}; use crate::merge_policy::NoMergePolicy; - use crate::schema::{Document, Field, NumericOptions, Schema, FAST, STRING, TEXT}; + use crate::schema::{Document, Field, Schema, FAST, STRING, TEXT}; use crate::time::OffsetDateTime; - use crate::{Index, SegmentId, SegmentReader}; + use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader}; pub static SCHEMA: Lazy = Lazy::new(|| { let mut schema_builder = Schema::builder(); @@ -559,8 +564,8 @@ mod tests { } #[test] - fn test_default_datetime() { - assert_eq!(0, DateTime::make_zero().into_unix_timestamp()); + fn test_default_date() { + assert_eq!(0, DateTime::make_zero().into_timestamp_secs()); } fn get_vals_for_docs(ff: &MultiValuedFastFieldReader, docs: Range) -> Vec { @@ -766,10 +771,15 @@ mod tests { fn test_datefastfield() -> crate::Result<()> { use crate::fastfield::FastValue; let mut schema_builder = Schema::builder(); - let date_field = schema_builder.add_date_field("date", FAST); + let date_field = schema_builder.add_date_field( + "date", + DateOptions::from(FAST).set_precision(DatePrecision::Microseconds), + ); let multi_date_field = 
schema_builder.add_date_field( "multi_date", - NumericOptions::default().set_fast(Cardinality::MultiValues), + DateOptions::default() + .set_precision(DatePrecision::Microseconds) + .set_fast(Cardinality::MultiValues), ); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -797,23 +807,23 @@ mod tests { let dates_fast_field = fast_fields.dates(multi_date_field).unwrap(); let mut dates = vec![]; { - assert_eq!(date_fast_field.get(0u32).into_unix_timestamp(), 1i64); + assert_eq!(date_fast_field.get(0u32).into_timestamp_micros(), 1i64); dates_fast_field.get_vals(0u32, &mut dates); assert_eq!(dates.len(), 2); - assert_eq!(dates[0].into_unix_timestamp(), 2i64); - assert_eq!(dates[1].into_unix_timestamp(), 3i64); + assert_eq!(dates[0].into_timestamp_micros(), 2i64); + assert_eq!(dates[1].into_timestamp_micros(), 3i64); } { - assert_eq!(date_fast_field.get(1u32).into_unix_timestamp(), 4i64); + assert_eq!(date_fast_field.get(1u32).into_timestamp_micros(), 4i64); dates_fast_field.get_vals(1u32, &mut dates); assert!(dates.is_empty()); } { - assert_eq!(date_fast_field.get(2u32).into_unix_timestamp(), 0i64); + assert_eq!(date_fast_field.get(2u32).into_timestamp_micros(), 0i64); dates_fast_field.get_vals(2u32, &mut dates); assert_eq!(dates.len(), 2); - assert_eq!(dates[0].into_unix_timestamp(), 5i64); - assert_eq!(dates[1].into_unix_timestamp(), 6i64); + assert_eq!(dates[0].into_timestamp_micros(), 5i64); + assert_eq!(dates[1].into_timestamp_micros(), 6i64); } Ok(()) } diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 172f758e0e..69870d0324 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -13,7 +13,7 @@ mod tests { use crate::collector::TopDocs; use crate::indexer::NoMergePolicy; use crate::query::QueryParser; - use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema}; + use crate::schema::{Cardinality, DateOptions, Facet, FacetOptions, NumericOptions, 
Schema}; use crate::time::format_description::well_known::Rfc3339; use crate::time::{Duration, OffsetDateTime}; use crate::{DateTime, Document, Index, Term}; @@ -58,7 +58,7 @@ mod tests { let mut schema_builder = Schema::builder(); let date_field = schema_builder.add_date_field( "multi_date_field", - NumericOptions::default() + DateOptions::default() .set_fast(Cardinality::MultiValues) .set_indexed() .set_fieldnorm() diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 7adbfb0bbe..7b976e3273 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -4,12 +4,12 @@ use fnv::FnvHashMap; use tantivy_bitpacker::minmax; use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy; -use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType}; +use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType, FastValue}; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::UnorderedTermId; -use crate::schema::{Document, Field}; +use crate::schema::{Document, Field, Value}; use crate::termdict::TermOrdinal; -use crate::DocId; +use crate::{DatePrecision, DocId}; /// Writer for multi-valued (as in, more than one value per document) /// int fast field. @@ -36,6 +36,7 @@ use crate::DocId; /// term ids when the segment is getting serialized. 
pub struct MultiValuedFastFieldWriter { field: Field, + precision_opt: Option, vals: Vec, doc_index: Vec, fast_field_type: FastFieldType, @@ -43,9 +44,14 @@ pub struct MultiValuedFastFieldWriter { impl MultiValuedFastFieldWriter { /// Creates a new `MultiValuedFastFieldWriter` - pub(crate) fn new(field: Field, fast_field_type: FastFieldType) -> Self { + pub(crate) fn new( + field: Field, + fast_field_type: FastFieldType, + precision_opt: Option, + ) -> Self { MultiValuedFastFieldWriter { field, + precision_opt, vals: Vec::new(), doc_index: Vec::new(), fast_field_type, @@ -83,7 +89,14 @@ impl MultiValuedFastFieldWriter { } for field_value in doc.field_values() { if field_value.field == self.field { - self.add_val(value_to_u64(field_value.value())); + let value = field_value.value(); + let value_u64 = match (self.precision_opt, value) { + (Some(precision), Value::Date(date_val)) => { + date_val.truncate(precision).to_u64() + } + _ => value_to_u64(value), + }; + self.add_val(value_u64); } } } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index c0e9b6982b..4d1b5d3467 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -7,12 +7,13 @@ use tantivy_bitpacker::BlockedBitpacker; use super::multivalued::MultiValuedFastFieldWriter; use super::serializer::FastFieldStats; -use super::{FastFieldDataAccess, FastFieldType}; +use super::{FastFieldDataAccess, FastFieldType, FastValue}; use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer}; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::UnorderedTermId; -use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema}; +use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema, Value}; use crate::termdict::TermOrdinal; +use crate::DatePrecision; /// The `FastFieldsWriter` groups all of the fast field writers. 
pub struct FastFieldsWriter { @@ -43,31 +44,51 @@ impl FastFieldsWriter { FieldType::I64(ref int_options) | FieldType::U64(ref int_options) | FieldType::F64(ref int_options) - | FieldType::Bool(ref int_options) - | FieldType::Date(ref int_options) => { + | FieldType::Bool(ref int_options) => { match int_options.get_fastfield_cardinality() { Some(Cardinality::SingleValue) => { - let mut fast_field_writer = IntFastFieldWriter::new(field); + let mut fast_field_writer = IntFastFieldWriter::new(field, None); let default_value = fast_field_default_value(field_entry); fast_field_writer.set_val_if_missing(default_value); single_value_writers.push(fast_field_writer); } Some(Cardinality::MultiValues) => { - let fast_field_writer = - MultiValuedFastFieldWriter::new(field, FastFieldType::Numeric); + let fast_field_writer = MultiValuedFastFieldWriter::new( + field, + FastFieldType::Numeric, + None, + ); multi_values_writers.push(fast_field_writer); } None => {} } } + FieldType::Date(ref options) => match options.get_fastfield_cardinality() { + Some(Cardinality::SingleValue) => { + let mut fast_field_writer = + IntFastFieldWriter::new(field, Some(options.get_precision())); + let default_value = fast_field_default_value(field_entry); + fast_field_writer.set_val_if_missing(default_value); + single_value_writers.push(fast_field_writer); + } + Some(Cardinality::MultiValues) => { + let fast_field_writer = MultiValuedFastFieldWriter::new( + field, + FastFieldType::Numeric, + Some(options.get_precision()), + ); + multi_values_writers.push(fast_field_writer); + } + None => {} + }, FieldType::Facet(_) => { let fast_field_writer = - MultiValuedFastFieldWriter::new(field, FastFieldType::Facet); + MultiValuedFastFieldWriter::new(field, FastFieldType::Facet, None); term_id_writers.push(fast_field_writer); } FieldType::Str(_) if field_entry.is_fast() => { let fast_field_writer = - MultiValuedFastFieldWriter::new(field, FastFieldType::String); + MultiValuedFastFieldWriter::new(field, 
FastFieldType::String, None); term_id_writers.push(fast_field_writer); } FieldType::Bytes(bytes_option) => { @@ -230,6 +251,7 @@ impl FastFieldsWriter { /// using `common::i64_to_u64` and `common::f64_to_u64`. pub struct IntFastFieldWriter { field: Field, + precision_opt: Option, vals: BlockedBitpacker, val_count: usize, val_if_missing: u64, @@ -239,9 +261,10 @@ pub struct IntFastFieldWriter { impl IntFastFieldWriter { /// Creates a new `IntFastFieldWriter` - pub fn new(field: Field) -> IntFastFieldWriter { + pub fn new(field: Field, precision_opt: Option) -> IntFastFieldWriter { IntFastFieldWriter { field, + precision_opt, vals: BlockedBitpacker::new(), val_count: 0, val_if_missing: 0u64, @@ -305,7 +328,13 @@ impl IntFastFieldWriter { pub fn add_document(&mut self, doc: &Document) { match doc.get_first(self.field) { Some(v) => { - self.add_val(super::value_to_u64(v)); + let value = match (self.precision_opt, v) { + (Some(precision), Value::Date(date_val)) => { + date_val.truncate(precision).to_u64() + } + _ => super::value_to_u64(v), + }; + self.add_val(value); } None => { self.add_val(self.val_if_missing); diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs index 501be0cfc1..d3d6478c5a 100644 --- a/src/indexer/json_term_writer.rs +++ b/src/indexer/json_term_writer.rs @@ -8,7 +8,7 @@ use crate::schema::{Field, Type}; use crate::time::format_description::well_known::Rfc3339; use crate::time::{OffsetDateTime, UtcOffset}; use crate::tokenizer::TextAnalyzer; -use crate::{DateTime, DocId, Term}; +use crate::{DatePrecision, DateTime, DocId, Term}; /// This object is a map storing the last position for a given path for the current document /// being indexed. 
@@ -323,9 +323,16 @@ impl<'a> JsonTermWriter<'a> { pub fn set_fast_value(&mut self, val: T) { self.close_path_and_set_type(T::to_type()); + let value = if T::to_type() == Type::Date { + DateTime::from_u64(val.to_u64()) + .truncate(DatePrecision::Seconds) + .to_u64() + } else { + val.to_u64() + }; self.term_buffer .as_mut() - .extend_from_slice(val.to_u64().to_be_bytes().as_slice()); + .extend_from_slice(value.to_be_bytes().as_slice()); } #[cfg(test)] diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 340ca9127b..2ac6ec339b 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -298,8 +298,16 @@ impl IndexMerger { FieldType::U64(ref options) | FieldType::I64(ref options) | FieldType::F64(ref options) - | FieldType::Bool(ref options) - | FieldType::Date(ref options) => match options.get_fastfield_cardinality() { + | FieldType::Bool(ref options) => match options.get_fastfield_cardinality() { + Some(Cardinality::SingleValue) => { + self.write_single_fast_field(field, fast_field_serializer, doc_id_mapping)?; + } + Some(Cardinality::MultiValues) => { + self.write_multi_fast_field(field, fast_field_serializer, doc_id_mapping)?; + } + None => {} + }, + FieldType::Date(ref options) => match options.get_fastfield_cardinality() { Some(Cardinality::SingleValue) => { self.write_single_fast_field(field, fast_field_serializer, doc_id_mapping)?; } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 727eea9537..733bc9a089 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -14,7 +14,7 @@ use crate::store::{StoreReader, StoreWriter}; use crate::tokenizer::{ BoxTokenStream, FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer, }; -use crate::{DocId, Document, Opstamp, SegmentComponent}; +use crate::{DatePrecision, DocId, Document, Opstamp, SegmentComponent}; /// Computes the initial size of the hash table. 
/// @@ -248,7 +248,7 @@ impl SegmentWriter { FieldType::Date(_) => { for value in values { let date_val = value.as_date().ok_or_else(make_schema_error)?; - term_buffer.set_u64(date_val.to_u64()); + term_buffer.set_u64(date_val.truncate(DatePrecision::Seconds).to_u64()); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); } } diff --git a/src/lib.rs b/src/lib.rs index 1490709de6..355cea54ac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -133,7 +133,7 @@ pub use time; use crate::time::format_description::well_known::Rfc3339; use crate::time::{OffsetDateTime, PrimitiveDateTime, UtcOffset}; -/// A date/time value with second precision. +/// A date/time value with microsecond precision. /// /// This timestamp does not carry any explicit time zone information. /// Users are responsible for applying the provided conversion @@ -145,13 +145,30 @@ use crate::time::{OffsetDateTime, PrimitiveDateTime, UtcOffset}; /// to prevent unintended usage. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct DateTime { - unix_timestamp: i64, + // Timestamp in microseconds. + pub(crate) timestamp_micros: i64, } impl DateTime { - /// Create new from UNIX timestamp - pub const fn from_unix_timestamp(unix_timestamp: i64) -> Self { - Self { unix_timestamp } + /// Create new from UNIX timestamp in seconds + pub const fn from_timestamp_secs(seconds: i64) -> Self { + Self { + timestamp_micros: seconds * 1_000_000, + } + } + + /// Create new from UNIX timestamp in milliseconds + pub const fn from_timestamp_millis(milliseconds: i64) -> Self { + Self { + timestamp_micros: milliseconds * 1_000, + } + } + + /// Create new from UNIX timestamp in microseconds. + pub const fn from_timestamp_micros(microseconds: i64) -> Self { + Self { + timestamp_micros: microseconds, + } } /// Create new from `OffsetDateTime` @@ -159,7 +176,8 @@ impl DateTime { /// The given date/time is converted to UTC and the actual /// time zone is discarded. 
pub const fn from_utc(dt: OffsetDateTime) -> Self { - Self::from_unix_timestamp(dt.unix_timestamp()) + let timestamp_micros = dt.unix_timestamp() as i64 * 1_000_000 + dt.microsecond() as i64; + Self { timestamp_micros } } /// Create new from `PrimitiveDateTime` @@ -167,21 +185,30 @@ impl DateTime { /// Implicitly assumes that the given date/time is in UTC! /// Otherwise the original value must only be reobtained with /// [`Self::into_primitive()`]. - pub const fn from_primitive(dt: PrimitiveDateTime) -> Self { + pub fn from_primitive(dt: PrimitiveDateTime) -> Self { Self::from_utc(dt.assume_utc()) } - /// Convert to UNIX timestamp - pub const fn into_unix_timestamp(self) -> i64 { - let Self { unix_timestamp } = self; - unix_timestamp + /// Convert to UNIX timestamp in seconds. + pub const fn into_timestamp_secs(self) -> i64 { + self.timestamp_micros / 1_000_000 + } + + /// Convert to UNIX timestamp in milliseconds. + pub const fn into_timestamp_millis(self) -> i64 { + self.timestamp_micros / 1_000 + } + + /// Convert to UNIX timestamp in microseconds. + pub const fn into_timestamp_micros(self) -> i64 { + self.timestamp_micros } /// Convert to UTC `OffsetDateTime` pub fn into_utc(self) -> OffsetDateTime { - let Self { unix_timestamp } = self; - let utc_datetime = - OffsetDateTime::from_unix_timestamp(unix_timestamp).expect("valid UNIX timestamp"); + let timestamp_nanos = self.timestamp_micros as i128 * 1000; + let utc_datetime = OffsetDateTime::from_unix_timestamp_nanos(timestamp_nanos) + .expect("valid UNIX timestamp"); debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset()); utc_datetime } @@ -201,6 +228,18 @@ impl DateTime { debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset()); PrimitiveDateTime::new(utc_datetime.date(), utc_datetime.time()) } + + /// Truncates the microseconds value to the corresponding precision. 
+ pub(crate) fn truncate(self, precision: DatePrecision) -> Self { + let truncated_timestamp_micros = match precision { + DatePrecision::Seconds => (self.timestamp_micros / 1_000_000) * 1_000_000, + DatePrecision::Milliseconds => (self.timestamp_micros / 1_000) * 1_000, + DatePrecision::Microseconds => self.timestamp_micros, + }; + Self { + timestamp_micros: truncated_timestamp_micros, + } + } } impl fmt::Debug for DateTime { @@ -269,7 +308,7 @@ pub use crate::indexer::operation::UserOperation; pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, PreparedCommit}; pub use crate::postings::Postings; pub use crate::reader::LeasedItem; -pub use crate::schema::{Document, Term}; +pub use crate::schema::{DateOptions, DatePrecision, Document, Term}; /// Index format version. const INDEX_FORMAT_VERSION: u32 = 4; @@ -385,6 +424,7 @@ pub mod tests { use rand::distributions::{Bernoulli, Uniform}; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; + use time::OffsetDateTime; use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::core::SegmentReader; @@ -393,7 +433,7 @@ pub mod tests { use crate::merge_policy::NoMergePolicy; use crate::query::BooleanQuery; use crate::schema::*; - use crate::{DocAddress, Index, Postings, ReloadPolicy}; + use crate::{DateTime, DocAddress, Index, Postings, ReloadPolicy}; pub fn fixed_size_test() { let mut buffer = Vec::new(); @@ -1102,4 +1142,35 @@ pub mod tests { assert!(index.validate_checksum()?.is_empty()); Ok(()) } + + #[test] + fn test_datetime() { + let now = OffsetDateTime::now_utc(); + + let dt = DateTime::from_utc(now).into_utc(); + assert_eq!(dt.to_ordinal_date(), now.to_ordinal_date()); + assert_eq!(dt.to_hms_micro(), now.to_hms_micro()); + // We don't store nanosecond level precision. 
+ assert_ne!(dt.to_hms_nano(), now.to_hms_nano()); + + let dt = DateTime::from_timestamp_secs(now.unix_timestamp()).into_utc(); + assert_eq!(dt.to_ordinal_date(), now.to_ordinal_date()); + assert_eq!(dt.to_hms(), now.to_hms()); + // Constructed from a second precision. + assert_ne!(dt.to_hms_micro(), now.to_hms_micro()); + + let dt = + DateTime::from_timestamp_micros((now.unix_timestamp_nanos() / 1_000) as i64).into_utc(); + assert_eq!(dt.to_ordinal_date(), now.to_ordinal_date()); + assert_eq!(dt.to_hms_micro(), now.to_hms_micro()); + + let dt_from_ts_nanos = + OffsetDateTime::from_unix_timestamp_nanos(18446744073709551615i128).unwrap(); + let offset_dt = DateTime::from_utc(dt_from_ts_nanos).into_utc(); + assert_eq!( + dt_from_ts_nanos.to_ordinal_date(), + offset_dt.to_ordinal_date() + ); + assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro()); + } } diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index 737e9ea435..299d38ae1a 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -243,13 +243,12 @@ impl MoreLikeThis { } FieldType::Date(_) => { for value in values { - // TODO: Ask if this is the semantic (timestamp) we want - let unix_timestamp = value + let timestamp_micros = value .as_date() .ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))? 
- .into_unix_timestamp(); - if !self.is_noise_word(unix_timestamp.to_string()) { - let term = Term::from_field_i64(field, unix_timestamp); + .into_timestamp_micros(); + if !self.is_noise_word(timestamp_micros.to_string()) { + let term = Term::from_field_i64(field, timestamp_micros); *term_frequencies.entry(term).or_insert(0) += 1; } } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 5241a699d6..74eacc0ef3 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -1068,7 +1068,6 @@ mod test { #[test] fn test_json_field_possibly_a_date() { - // Subseconds are discarded test_parse_query_to_logical_ast_helper( r#"json.date:"2019-10-12T07:20:50.52Z""#, r#"(Term(type=Json, field=14, path=date, vtype=Date, 2019-10-12T07:20:50Z) "[(0, Term(type=Json, field=14, path=date, vtype=Str, "2019")), (1, Term(type=Json, field=14, path=date, vtype=Str, "10")), (2, Term(type=Json, field=14, path=date, vtype=Str, "12t07")), (3, Term(type=Json, field=14, path=date, vtype=Str, "20")), (4, Term(type=Json, field=14, path=date, vtype=Str, "50")), (5, Term(type=Json, field=14, path=date, vtype=Str, "52z"))]")"#, @@ -1352,9 +1351,16 @@ mod test { query_parser.parse_query("date:18a"), Err(QueryParserError::DateFormatError(_)) ); - assert!(query_parser - .parse_query("date:\"1985-04-12T23:20:50.52Z\"") - .is_ok()); + test_parse_query_to_logical_ast_helper( + r#"date:"2010-11-21T09:55:06.000000000+02:00""#, + r#"Term(type=Date, field=9, 2010-11-21T07:55:06Z)"#, + true, + ); + test_parse_query_to_logical_ast_helper( + r#"date:"1985-04-12T23:20:50.52Z""#, + r#"Term(type=Date, field=9, 1985-04-12T23:20:50Z)"#, + true, + ); } #[test] diff --git a/src/schema/date_time_options.rs b/src/schema/date_time_options.rs new file mode 100644 index 0000000000..0f6eea978e --- /dev/null +++ b/src/schema/date_time_options.rs @@ -0,0 +1,276 @@ +use std::ops::BitOr; + +use serde::{Deserialize, Serialize}; + +use 
super::Cardinality; +use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; + +/// DateTime Precision +#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum DatePrecision { + /// Seconds precision + Seconds, + /// Milli-seconds precision. + Milliseconds, + /// Micro-seconds precision. + Microseconds, +} + +impl Default for DatePrecision { + fn default() -> Self { + DatePrecision::Seconds + } +} + +/// Defines how DateTime field should be handled by tantivy. +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +pub struct DateOptions { + indexed: bool, + // This boolean has no effect if the field is not marked as indexed true. + fieldnorms: bool, + #[serde(skip_serializing_if = "Option::is_none")] + fast: Option, + stored: bool, + // Internal storage precision, used to optimize storage + // compression on fast fields. + #[serde(default)] + precision: DatePrecision, +} + +impl DateOptions { + /// Returns true iff the value is stored. + pub fn is_stored(&self) -> bool { + self.stored + } + + /// Returns true iff the value is indexed and therefore searchable. + pub fn is_indexed(&self) -> bool { + self.indexed + } + + /// Returns true iff the field has fieldnorm. + pub fn fieldnorms(&self) -> bool { + self.fieldnorms && self.indexed + } + + /// Returns true iff the value is a fast field and multivalue. + pub fn is_multivalue_fast(&self) -> bool { + if let Some(cardinality) = self.fast { + cardinality == Cardinality::MultiValues + } else { + false + } + } + + /// Returns true iff the value is a fast field. + pub fn is_fast(&self) -> bool { + self.fast.is_some() + } + + /// Set the field as stored. + /// + /// Only the fields that are set as *stored* are + /// persisted into the Tantivy's store. + #[must_use] + pub fn set_stored(mut self) -> DateOptions { + self.stored = true; + self + } + + /// Set the field as indexed. 
+ /// + /// Setting an integer as indexed will generate + /// a posting list for each value taken by the integer. + /// + /// This is required for the field to be searchable. + #[must_use] + pub fn set_indexed(mut self) -> DateOptions { + self.indexed = true; + self + } + + /// Set the field with fieldnorm. + /// + /// Setting an integer as fieldnorm will generate + /// the fieldnorm data for it. + #[must_use] + pub fn set_fieldnorm(mut self) -> DateOptions { + self.fieldnorms = true; + self + } + + /// Set the field as a single-valued fast field. + /// + /// Fast fields are designed for random access. + /// Access time are similar to a random lookup in an array. + /// If more than one value is associated to a fast field, only the last one is + /// kept. + #[must_use] + pub fn set_fast(mut self, cardinality: Cardinality) -> DateOptions { + self.fast = Some(cardinality); + self + } + + /// Returns the cardinality of the fastfield. + /// + /// If the field has not been declared as a fastfield, then + /// the method returns None. + pub fn get_fastfield_cardinality(&self) -> Option { + self.fast + } + + /// Sets the precision for this DateTime field. + /// + /// Internal storage precision, used to optimize storage + /// compression on fast fields. + pub fn set_precision(mut self, precision: DatePrecision) -> DateOptions { + self.precision = precision; + self + } + + /// Returns the storage precision for this DateTime field. + /// + /// Internal storage precision, used to optimize storage + /// compression on fast fields. 
+ pub fn get_precision(&self) -> DatePrecision { + self.precision + } +} + +impl From<()> for DateOptions { + fn from(_: ()) -> DateOptions { + DateOptions::default() + } +} + +impl From<FastFlag> for DateOptions { + fn from(_: FastFlag) -> Self { + DateOptions { + indexed: false, + fieldnorms: false, + stored: false, + fast: Some(Cardinality::SingleValue), + ..Default::default() + } + } +} + +impl From<StoredFlag> for DateOptions { + fn from(_: StoredFlag) -> Self { + DateOptions { + indexed: false, + fieldnorms: false, + stored: true, + fast: None, + ..Default::default() + } + } +} + +impl From<IndexedFlag> for DateOptions { + fn from(_: IndexedFlag) -> Self { + DateOptions { + indexed: true, + fieldnorms: true, + stored: false, + fast: None, + ..Default::default() + } + } +} + +impl<T: Into<DateOptions>> BitOr<T> for DateOptions { + type Output = DateOptions; + + fn bitor(self, other: T) -> DateOptions { + let other = other.into(); + DateOptions { + indexed: self.indexed | other.indexed, + fieldnorms: self.fieldnorms | other.fieldnorms, + stored: self.stored | other.stored, + fast: self.fast.or(other.fast), + precision: self.precision, + } + } +} + +impl<Head, Tail> From<SchemaFlagList<Head, Tail>> for DateOptions +where + Head: Clone, + Tail: Clone, + Self: BitOr<Output = Self> + From<Head> + From<Tail>, +{ + fn from(head_tail: SchemaFlagList<Head, Tail>) -> Self { + Self::from(head_tail.head) | Self::from(head_tail.tail) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_date_options_consistent_with_default() { + let date_time_options: DateOptions = serde_json::from_str( + r#"{ + "indexed": false, + "fieldnorms": false, + "stored": false + }"#, + ) + .unwrap(); + assert_eq!(date_time_options, DateOptions::default()); + } + + #[test] + fn test_serialize_date_option() { + let date_options = serde_json::from_str::<DateOptions>( + r#" + { + "indexed": true, + "fieldnorms": false, + "stored": false, + "precision": "milliseconds" + }"#, + ) + .unwrap(); + + let date_options_json = serde_json::to_value(&date_options).unwrap(); + assert_eq!( + date_options_json, + serde_json::json!({ + 
"precision": "milliseconds", + "indexed": true, + "fieldnorms": false, + "stored": false + }) + ); + } + + #[test] + fn test_deserialize_date_options_with_wrong_options() { + assert!(serde_json::from_str::( + r#"{ + "indexed": true, + "fieldnorms": false, + "stored": "wrong_value" + }"# + ) + .unwrap_err() + .to_string() + .contains("expected a boolean")); + + assert!(serde_json::from_str::( + r#"{ + "indexed": true, + "fieldnorms": false, + "stored": false, + "precision": "hours" + }"# + ) + .unwrap_err() + .to_string() + .contains("unknown variant `hours`")); + } +} diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 78b5891f37..997fbd2564 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -2,7 +2,8 @@ use serde::{Deserialize, Serialize}; use crate::schema::bytes_options::BytesOptions; use crate::schema::{ - is_valid_field_name, FacetOptions, FieldType, JsonObjectOptions, NumericOptions, TextOptions, + is_valid_field_name, DateOptions, FacetOptions, FieldType, JsonObjectOptions, NumericOptions, + TextOptions, }; /// A `FieldEntry` represents a field and its configuration. @@ -55,7 +56,7 @@ impl FieldEntry { } /// Creates a new date field entry. 
- pub fn new_date(field_name: String, date_options: NumericOptions) -> FieldEntry { + pub fn new_date(field_name: String, date_options: DateOptions) -> FieldEntry { Self::new(field_name, FieldType::Date(date_options)) } @@ -107,8 +108,8 @@ impl FieldEntry { FieldType::U64(ref options) | FieldType::I64(ref options) | FieldType::F64(ref options) - | FieldType::Date(ref options) | FieldType::Bool(ref options) => options.is_stored(), + FieldType::Date(ref options) => options.is_stored(), FieldType::Str(ref options) => options.is_stored(), FieldType::Facet(ref options) => options.is_stored(), FieldType::Bytes(ref options) => options.is_stored(), diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 1080d83fc1..c29c695543 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -5,8 +5,8 @@ use thiserror::Error; use crate::schema::bytes_options::BytesOptions; use crate::schema::facet_options::FacetOptions; use crate::schema::{ - Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing, TextOptions, - Value, + DateOptions, Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing, + TextOptions, Value, }; use crate::time::format_description::well_known::Rfc3339; use crate::time::OffsetDateTime; @@ -27,6 +27,11 @@ pub enum ValueParsingError { expected: &'static str, json: serde_json::Value, }, + #[error("Parse error on {json}: {error}")] + ParseError { + error: String, + json: serde_json::Value, + }, #[error("Invalid base64: {base64}")] InvalidBase64 { base64: String }, } @@ -133,7 +138,7 @@ pub enum FieldType { /// Bool field type configuration Bool(NumericOptions), /// Signed 64-bits Date 64 field type configuration, - Date(NumericOptions), + Date(DateOptions), /// Hierachical Facet Facet(FacetOptions), /// Bytes (one per document) @@ -202,8 +207,8 @@ impl FieldType { FieldType::U64(ref int_options) | FieldType::I64(ref int_options) | FieldType::F64(ref int_options) - | FieldType::Date(ref 
int_options) - | FieldType::Bool(ref int_options) => int_options.get_fastfield_cardinality().is_some(), + | FieldType::Bool(ref int_options) => int_options.is_fast(), + FieldType::Date(ref date_options) => date_options.is_fast(), FieldType::Facet(_) => true, FieldType::JsonObject(_) => false, } @@ -219,8 +224,8 @@ impl FieldType { FieldType::U64(ref int_options) | FieldType::I64(ref int_options) | FieldType::F64(ref int_options) - | FieldType::Date(ref int_options) | FieldType::Bool(ref int_options) => int_options.fieldnorms(), + FieldType::Date(ref date_options) => date_options.fieldnorms(), FieldType::Facet(_) => false, FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(), FieldType::JsonObject(ref _json_object_options) => false, @@ -243,7 +248,6 @@ impl FieldType { FieldType::U64(ref int_options) | FieldType::I64(ref int_options) | FieldType::F64(ref int_options) - | FieldType::Date(ref int_options) | FieldType::Bool(ref int_options) => { if int_options.is_indexed() { Some(IndexRecordOption::Basic) @@ -251,6 +255,13 @@ impl FieldType { None } } + FieldType::Date(ref date_options) => { + if date_options.is_indexed() { + Some(IndexRecordOption::Basic) + } else { + None + } + } FieldType::Facet(ref _facet_options) => Some(IndexRecordOption::Basic), FieldType::Bytes(ref bytes_options) => { if bytes_options.is_indexed() { @@ -273,7 +284,7 @@ impl FieldType { pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> { match json { JsonValue::String(field_text) => { - match *self { + match self { FieldType::Date(_) => { let dt_with_fixed_tz = OffsetDateTime::parse(&field_text, &Rfc3339) .map_err(|_err| ValueParsingError::TypeError { @@ -402,8 +413,8 @@ mod tests { let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#; let doc = schema.parse_document(doc_json).unwrap(); let date = doc.get_first(date_field).unwrap(); - // Time zone is converted to UTC and subseconds are discarded - assert_eq!("Date(2019-10-12T05:20:50Z)", format!("{:?}", date)); + // 
Time zone is converted to UTC + assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{:?}", date)); } #[test] diff --git a/src/schema/flags.rs b/src/schema/flags.rs index 06c12f8ca5..3fa73e7a6e 100644 --- a/src/schema/flags.rs +++ b/src/schema/flags.rs @@ -1,6 +1,7 @@ use std::ops::BitOr; use crate::schema::{NumericOptions, TextOptions}; +use crate::DateOptions; #[derive(Clone)] pub struct StoredFlag; @@ -65,6 +66,14 @@ impl<T: Into<NumericOptions>> BitOr<NumericOptions> for SchemaFlagList<T, ()> +impl<T: Into<DateOptions>> BitOr<DateOptions> for SchemaFlagList<T, ()> { + type Output = DateOptions; + + fn bitor(self, rhs: DateOptions) -> Self::Output { + self.head.into() | rhs + } +} + impl<T: Into<TextOptions>> BitOr<TextOptions> for SchemaFlagList<T, ()> { type Output = TextOptions; diff --git a/src/schema/mod.rs b/src/schema/mod.rs index b595e0380d..2c145f8be1 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -117,6 +117,7 @@ mod field_type; mod field_value; mod bytes_options; +mod date_time_options; mod field; mod flags; mod index_record_option; @@ -127,6 +128,7 @@ mod text_options; mod value; pub use self::bytes_options::BytesOptions; +pub use self::date_time_options::{DateOptions, DatePrecision}; pub use self::document::Document; pub(crate) use self::facet::FACET_SEP_BYTE; pub use self::facet::{Facet, FacetParseError}; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 00aff9f38c..e91a31a625 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -134,7 +134,7 @@ impl SchemaBuilder { /// by the second one. 
/// The first field will get a field id /// but only the second one will be indexed - pub fn add_date_field<T: Into<NumericOptions>>( + pub fn add_date_field<T: Into<DateOptions>>( &mut self, field_name_str: &str, field_options: T, @@ -813,7 +813,7 @@ mod tests { .set_tokenizer("raw") .set_index_option(IndexRecordOption::Basic), ); - let timestamp_options = NumericOptions::default() + let timestamp_options = DateOptions::default() .set_stored() .set_indexed() .set_fieldnorm() @@ -875,7 +875,8 @@ mod tests { "indexed": true, "fieldnorms": true, "fast": "single", - "stored": true + "stored": true, + "precision": "seconds" } }, { diff --git a/src/schema/term.rs b/src/schema/term.rs index a485ef8960..d9b1912a1b 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -5,7 +5,7 @@ use std::{fmt, str}; use super::Field; use crate::fastfield::FastValue; use crate::schema::{Facet, Type}; -use crate::DateTime; +use crate::{DatePrecision, DateTime}; /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term. /// <field> + <type byte> + <value len> @@ -76,7 +76,7 @@ impl Term { /// Builds a term given a field, and a DateTime value pub fn from_field_date(field: Field, val: DateTime) -> Term { - Term::from_fast_value(field, &val) + Term::from_fast_value(field, &val.truncate(DatePrecision::Seconds)) } /// Creates a `Term` given a facet. 
diff --git a/src/schema/value.rs b/src/schema/value.rs index d5e4f72cde..5335f88452 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -24,7 +24,7 @@ pub enum Value { F64(f64), /// Bool value Bool(bool), - /// Date/time with second precision + /// Date/time with microseconds precision Date(DateTime), /// Facet Facet(Facet), @@ -251,7 +251,7 @@ impl<'a> From<&'a [u8]> for Value { } } -impl<'a> From<Facet> for Value { +impl From<Facet> for Value { fn from(facet: Facet) -> Value { Value::Facet(facet) } @@ -348,8 +348,10 @@ mod binary_serialize { } Value::Date(ref val) => { DATE_CODE.serialize(writer)?; - let DateTime { unix_timestamp } = val; - unix_timestamp.serialize(writer) + let DateTime { + timestamp_micros, .. + } = val; + timestamp_micros.serialize(writer) } Value::Facet(ref facet) => { HIERARCHICAL_FACET_CODE.serialize(writer)?; @@ -391,8 +393,10 @@ mod binary_serialize { Ok(Value::Bool(value)) } DATE_CODE => { - let unix_timestamp = i64::deserialize(reader)?; - Ok(Value::Date(DateTime::from_unix_timestamp(unix_timestamp))) + let timestamp_micros = i64::deserialize(reader)?; + Ok(Value::Date(DateTime::from_timestamp_micros( + timestamp_micros, + ))) } HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)), BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),