Skip to content

Commit

Permalink
add aggregation support for date type (#1693)
Browse files Browse the repository at this point in the history
* add aggregation support for date type
fixes #1332

* serialize key_as_string as rfc3339 in date histogram
* update docs
* enable date for range aggregation
  • Loading branch information
PSeitz authored Nov 28, 2022
1 parent 600548f commit ee1f2c1
Show file tree
Hide file tree
Showing 12 changed files with 339 additions and 77 deletions.
2 changes: 1 addition & 1 deletion examples/aggregation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ fn main() -> tantivy::Result<()> {
.into_iter()
.collect();

let collector = AggregationCollector::from_aggs(agg_req_1, None);
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());

let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
Expand Down
10 changes: 2 additions & 8 deletions src/aggregation/agg_req_with_accessor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
use super::metric::{AverageAggregation, StatsAggregation};
use super::segment_agg_result::BucketCount;
use super::VecWithNames;
use crate::fastfield::{type_and_cardinality, FastType, MultiValuedFastFieldReader};
use crate::fastfield::{type_and_cardinality, MultiValuedFastFieldReader};
use crate::schema::{Cardinality, Type};
use crate::{InvertedIndexReader, SegmentReader, TantivyError};

Expand Down Expand Up @@ -194,13 +194,7 @@ fn get_ff_reader_and_validate(
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
let field_type = reader.schema().get_field_entry(field).field_type();

if let Some((ff_type, field_cardinality)) = type_and_cardinality(field_type) {
if ff_type == FastType::Date {
return Err(TantivyError::InvalidArgument(
"Unsupported field type date in aggregation".to_string(),
));
}

if let Some((_ff_type, field_cardinality)) = type_and_cardinality(field_type) {
if cardinality != field_cardinality {
return Err(TantivyError::InvalidArgument(format!(
"Invalid field cardinality on field {} expected {:?}, but got {:?}",
Expand Down
17 changes: 15 additions & 2 deletions src/aggregation/agg_result.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use super::bucket::GetDocCount;
use super::intermediate_agg_result::{IntermediateBucketResult, IntermediateMetricResult};
use super::metric::{SingleMetricResult, Stats};
use super::Key;
use crate::schema::Schema;
use crate::TantivyError;

#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
Expand Down Expand Up @@ -129,9 +130,12 @@ pub enum BucketResult {
}

impl BucketResult {
pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result<Self> {
pub(crate) fn empty_from_req(
req: &BucketAggregationInternal,
schema: &Schema,
) -> crate::Result<Self> {
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
empty_bucket.into_final_bucket_result(req)
empty_bucket.into_final_bucket_result(req, schema)
}
}

Expand Down Expand Up @@ -174,6 +178,9 @@ pub enum BucketEntries<T> {
/// ```
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct BucketEntry {
#[serde(skip_serializing_if = "Option::is_none")]
/// The string representation of the bucket.
pub key_as_string: Option<String>,
/// The identifier of the bucket.
pub key: Key,
/// Number of documents in the bucket.
Expand Down Expand Up @@ -238,4 +245,10 @@ pub struct RangeBucketEntry {
/// The to range of the bucket. Equals `f64::MAX` when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub to: Option<f64>,
/// The optional string representation for the `from` range.
#[serde(skip_serializing_if = "Option::is_none")]
pub from_as_string: Option<String>,
/// The optional string representation for the `to` range.
#[serde(skip_serializing_if = "Option::is_none")]
pub to_as_string: Option<String>,
}
97 changes: 90 additions & 7 deletions src/aggregation/bucket/histogram/histogram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ use crate::aggregation::agg_req_with_accessor::{
AggregationsWithAccessor, BucketAggregationWithAccessor,
};
use crate::aggregation::agg_result::BucketEntry;
use crate::aggregation::f64_from_fastfield_u64;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
};
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
use crate::schema::Type;
use crate::aggregation::{f64_from_fastfield_u64, format_date};
use crate::schema::{Schema, Type};
use crate::{DocId, TantivyError};

/// Histogram is a bucket aggregation, where buckets are created dynamically for given `interval`.
Expand Down Expand Up @@ -451,6 +451,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
buckets: Vec<IntermediateHistogramBucketEntry>,
histogram_req: &HistogramAggregation,
sub_aggregation: &AggregationsInternal,
schema: &Schema,
) -> crate::Result<Vec<BucketEntry>> {
// Generate the full list of buckets without gaps.
//
Expand Down Expand Up @@ -491,7 +492,9 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
sub_aggregation: empty_sub_aggregation.clone(),
},
})
.map(|intermediate_bucket| intermediate_bucket.into_final_bucket_entry(sub_aggregation))
.map(|intermediate_bucket| {
intermediate_bucket.into_final_bucket_entry(sub_aggregation, schema)
})
.collect::<crate::Result<Vec<_>>>()
}

Expand All @@ -500,20 +503,43 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
buckets: Vec<IntermediateHistogramBucketEntry>,
histogram_req: &HistogramAggregation,
sub_aggregation: &AggregationsInternal,
schema: &Schema,
) -> crate::Result<Vec<BucketEntry>> {
if histogram_req.min_doc_count() == 0 {
let mut buckets = if histogram_req.min_doc_count() == 0 {
// With min_doc_count != 0, we may need to add buckets, so that there are no
// gaps, since intermediate result does not contain empty buckets (filtered to
// reduce serialization size).

intermediate_buckets_to_final_buckets_fill_gaps(buckets, histogram_req, sub_aggregation)
intermediate_buckets_to_final_buckets_fill_gaps(
buckets,
histogram_req,
sub_aggregation,
schema,
)?
} else {
buckets
.into_iter()
.filter(|histogram_bucket| histogram_bucket.doc_count >= histogram_req.min_doc_count())
.map(|histogram_bucket| histogram_bucket.into_final_bucket_entry(sub_aggregation))
.collect::<crate::Result<Vec<_>>>()
.map(|histogram_bucket| {
histogram_bucket.into_final_bucket_entry(sub_aggregation, schema)
})
.collect::<crate::Result<Vec<_>>>()?
};

// If we have a date type on the histogram buckets, we add the `key_as_string` field as rfc339
let field = schema
.get_field(&histogram_req.field)
.ok_or_else(|| TantivyError::FieldNotFound(histogram_req.field.to_string()))?;
if schema.get_field_entry(field).field_type().is_date() {
for bucket in buckets.iter_mut() {
if let crate::aggregation::Key::F64(val) = bucket.key {
let key_as_string = format_date(val as i64)?;
bucket.key_as_string = Some(key_as_string);
}
}
}

Ok(buckets)
}

/// Applies req extended_bounds/hard_bounds on the min_max value
Expand Down Expand Up @@ -1372,6 +1398,63 @@ mod tests {
Ok(())
}

#[test]
fn histogram_date_test_single_segment() -> crate::Result<()> {
histogram_date_test_with_opt(true)
}

#[test]
fn histogram_date_test_multi_segment() -> crate::Result<()> {
histogram_date_test_with_opt(false)
}

fn histogram_date_test_with_opt(merge_segments: bool) -> crate::Result<()> {
let index = get_test_index_2_segments(merge_segments)?;

let agg_req: Aggregations = vec![(
"histogram".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "date".to_string(),
interval: 86400000000.0, // one day in microseconds
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();

let agg_res = exec_request(agg_req, &index)?;

let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;

assert_eq!(res["histogram"]["buckets"][0]["key"], 1546300800000000.0);
assert_eq!(
res["histogram"]["buckets"][0]["key_as_string"],
"2019-01-01T00:00:00Z"
);
assert_eq!(res["histogram"]["buckets"][0]["doc_count"], 1);

assert_eq!(res["histogram"]["buckets"][1]["key"], 1546387200000000.0);
assert_eq!(
res["histogram"]["buckets"][1]["key_as_string"],
"2019-01-02T00:00:00Z"
);

assert_eq!(res["histogram"]["buckets"][1]["doc_count"], 5);

assert_eq!(res["histogram"]["buckets"][2]["key"], 1546473600000000.0);
assert_eq!(
res["histogram"]["buckets"][2]["key_as_string"],
"2019-01-03T00:00:00Z"
);

assert_eq!(res["histogram"]["buckets"][3], Value::Null);

Ok(())
}

#[test]
fn histogram_invalid_request() -> crate::Result<()> {
let index = get_test_index_2_segments(true)?;
Expand Down
Loading

0 comments on commit ee1f2c1

Please sign in to comment.