Serialize DateTime as long to improve JSON serde performance #4038
Conversation
How does it compare to smile encoding? That is what is used by brokers/historicals.
It works with smile encoding; it is not a replacement. It is based on jsonMapper and smileMapper. @b-slim
@kaijianding this test is reporting the results for the default JSON ser/deser, and my question was whether it would be the same improvement if you use the smile mapper.
Yes, this improvement applies to both jsonMapper and smileMapper. I updated the benchmark code to use smileMapper, please check. @b-slim
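For illustration, the long-form serialization is just a Jackson serializer registered through a module, so the same module plugs into both a JSON-backed and a Smile-backed ObjectMapper. The following is a minimal sketch assuming Joda-Time, jackson-databind, and jackson-dataformat-smile on the classpath; it is not the PR's actual code:

```java
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.fasterxml.jackson.dataformat.smile.SmileFactory;
import org.joda.time.DateTime;

import java.io.IOException;

public class DateTimeAsLongModuleSketch
{
  public static void main(String[] args) throws IOException
  {
    // A Jackson module that writes Joda DateTime as epoch millis instead of an ISO string.
    SimpleModule module = new SimpleModule("DateTimeAsLong");
    module.addSerializer(DateTime.class, new JsonSerializer<DateTime>()
    {
      @Override
      public void serialize(DateTime value, JsonGenerator gen, SerializerProvider serializers) throws IOException
      {
        gen.writeNumber(value.getMillis());
      }
    });

    // The same module plugs into a JSON-backed mapper and a Smile-backed mapper.
    ObjectMapper jsonMapper = new ObjectMapper().registerModule(module);
    ObjectMapper smileMapper = new ObjectMapper(new SmileFactory()).registerModule(module);

    DateTime now = DateTime.now();
    System.out.println(jsonMapper.writeValueAsString(now));         // a bare number, e.g. 1494389954000
    System.out.println(smileMapper.writeValueAsBytes(now).length);  // same value in Smile's binary form
  }
}
```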
@kaijianding What is performance like if you serialize DateTimes as a timestamp + timezone offset, using a JSON array of
Usually the broker and historicals use the same timezone, so the current implementation is good enough for most cases.
Hmm, I don't think we need to worry about brokers/historicals being in non-UTC time zones, we have always said this is not a supported configuration. I was more thinking of supporting cases where a historical would return non-UTC time zones for other reasons, like when the "granularity" is a PeriodGranularity with a non-UTC time zone.
I tested timeseries and groupBy queries, and the results are correct for a non-UTC PeriodGranularity (we are in the China time zone and use a +08:00 PeriodGranularity all the time; the results are as expected). These two queries call granularity.truncate(), so the time zone is not a problem. I updated the benchmark code to test [timestamp, timezone offset] as a [long, int] pair. In this format it is slower than the pure timestamp format, but I think it is still good enough. @gianm
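For reference, the [timestamp, timezone offset] format being compared could be written with a Jackson serializer along these lines. This is only an illustrative sketch assuming Joda-Time and Jackson, not the code used in the PR or the benchmark:

```java
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.SerializerProvider;
import org.joda.time.DateTime;

import java.io.IOException;

/**
 * Writes a DateTime as [epochMillis, zoneOffsetMillis] so the receiver can
 * rebuild a fixed-offset timestamp. Slightly slower than writing a bare long,
 * but it preserves the time zone offset.
 */
public class DateTimeAsArraySerializer extends JsonSerializer<DateTime>
{
  @Override
  public void serialize(DateTime value, JsonGenerator gen, SerializerProvider serializers) throws IOException
  {
    gen.writeStartArray();
    gen.writeNumber(value.getMillis());
    gen.writeNumber(value.getZone().getOffset(value.getMillis()));
    gen.writeEndArray();
  }
}
```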
Can the benchmark be added to https://github.com/druid-io/druid/tree/master/benchmarks in the same style as is used there? That way the testing is standardized.
I just wrote some benchmark code to prove this PR can improve JSON ser/deser performance, but it is not really a benchmark against the current Druid code.
That's something nice (again). Would it be possible to get that feature also when talking to a historical directly?
The historical also uses QueryResource, so it works when talking to a historical directly, as long as you specify the serializeDateTimeAsLong flag in the query context. @KenjiTakahashi
Oh, it's in the context! I somehow thought it was an option directly on the query. Works fine, thanks.
It reduces time by 10%~20% in huge groupBy queries in my production environment. For
10%~20% sounds about right. In #3740 we benchmarked a roughly 30% improvement for removing timestamps completely (set to null) from the historical -> broker communication when they aren't necessary.
Should I serialize and deserialize the time zone in this PR? It may yield less improvement. My case uses a PeriodGranularity with a +08:00 time zone; groupBy and timeseries queries can handle the long timestamp correctly via granularity.truncate(), so there is no need to serialize and deserialize the time zone for these two queries. @gianm
@kaijianding hmm, maybe time zone isn't necessary then, if the granularity object on the broker "knows" the proper time zone and can reapply it. It sounds like that's what you're saying, in which case just using the millis should be fine. Really I'm just pushing to do something we can have on all the time, automatically, rather than something that requires users to set a special context flag. Then a lot more users will get the benefit, since most users just stick with defaults for things like this.
OK. Then I will make serializeDateTimeAsLongInner=true the default, and serialize and deserialize the time zone. @gianm
I've seen about 20% as well, back when we were profiling. There are multiple problems with JSON as an efficient serialization for larger sets of data; maybe at some point we'd be able to introduce something better for this purpose?
Yeah, it makes sense to explore options other than JSON. Druid uses JSON since all the original query types (timeBoundary, timeseries, topN, search) return relatively small result sets and so serde overhead of results doesn't matter much. But for groupBy, select, and scan, it does matter.
Can you fix conflicts and address the latest comment from @gianm about enabling the long serialization by default? Can you also verify that leaving out the time zone in the serialization works for queries other than GroupBy and Timeseries? We're planning on wrapping up the 0.10.1 release this week, this PR seems useful to include if possible.
👍
@@ -18,6 +18,8 @@ The query context is used for various query configuration parameters. The follow
|bySegment | `false` | Return "by segment" results. Primarily used for debugging, setting it to `true` returns results associated with the data segment they came from |
|finalize | `true` | Flag indicating whether to "finalize" aggregation results. Primarily used for debugging. For instance, the `hyperUnique` aggregator will return the full HyperLogLog sketch instead of the estimated cardinality when this flag is set to `false` |
|chunkPeriod | `P0D` (off) | At the broker node level, long interval queries (of any type) may be broken into shorter interval queries to parallelize merging more than normal. Broken up queries will use a larger share of cluster resources, but may be able to complete faster as a result. Use ISO 8601 periods. For example, if this property is set to `P1M` (one month), then a query covering a year would be broken into 12 smaller queries. The broker uses its query processing executor service to initiate processing for query chunks, so make sure "druid.processing.numThreads" is configured appropriately on the broker. [groupBy queries](groupbyquery.html) do not support chunkPeriod by default, although they do if using the legacy "v1" engine. |
|serializeDateTimeAsLong| `false` | If true, DateTime is serialized as long in the result returned by the broker and in the data transport between the broker and compute node |
|serializeDateTimeAsLongInner| `false` | If true, DateTime is serialized as long in the data transport between the broker and compute node |
I would call this serializeDateTimeAsLongInternal
Also, if setting this flag always improves performance, I would just make the default 'true'.
👍
It is much faster to serialize and deserialize DateTime as long in the JSON transport between the broker and historicals if the user is sure there is no time zone problem.
It is very useful when issuing a select query or a big groupBy query whose granularity is small, like PT10S.
serializeDateTimeAsLongInner=true means DateTime is serialized as long only between the broker and historicals.
serializeDateTimeAsLong=true means DateTime is also serialized as long in the result returned to the user, for users who need a long instead of a string, e.g. to display the DateTime in a different format.
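For a client (or the broker) that receives the long form but still wants DateTime objects, a tolerant deserializer can accept either representation. The following is a minimal sketch assuming Joda-Time and Jackson, not the PR's actual implementation:

```java
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;

import java.io.IOException;

/**
 * Accepts either an epoch-millis long (written when DateTime-as-long is
 * enabled) or the default ISO-8601 string, and returns a UTC DateTime.
 */
public class TolerantDateTimeDeserializer extends JsonDeserializer<DateTime>
{
  @Override
  public DateTime deserialize(JsonParser p, DeserializationContext ctxt) throws IOException
  {
    if (p.getCurrentToken() == JsonToken.VALUE_NUMBER_INT) {
      return new DateTime(p.getLongValue(), DateTimeZone.UTC);
    }
    return DateTime.parse(p.getText());
  }
}
```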
The benchmark code and its results show that the improvement applies to both jsonMapper and smileMapper.
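The original benchmark code and numbers are not reproduced here. A rough, self-contained micro-benchmark in the same spirit might look like the sketch below, assuming jackson-databind, jackson-datatype-joda, and joda-time on the classpath; class and constant names are hypothetical, it uses a plain timing loop instead of the JMH setup used in Druid's benchmarks module, and real measurements should use JMH:

```java
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.fasterxml.jackson.datatype.joda.JodaModule;
import org.joda.time.DateTime;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class DateTimeSerdeBenchmarkSketch
{
  private static final int ROWS = 100_000;
  private static final int ITERATIONS = 20;

  public static void main(String[] args) throws IOException
  {
    // Synthetic result rows: one DateTime every 10 seconds.
    List<DateTime> rows = new ArrayList<>(ROWS);
    for (int i = 0; i < ROWS; i++) {
      rows.add(new DateTime(1_500_000_000_000L + i * 10_000L));
    }

    // Baseline: ISO-8601 strings, roughly what the default result serialization produces.
    ObjectMapper isoMapper = new ObjectMapper()
        .registerModule(new JodaModule())
        .disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);

    // Candidate: DateTime written as epoch millis.
    SimpleModule asLong = new SimpleModule("DateTimeAsLong");
    asLong.addSerializer(DateTime.class, new JsonSerializer<DateTime>()
    {
      @Override
      public void serialize(DateTime value, JsonGenerator gen, SerializerProvider serializers) throws IOException
      {
        gen.writeNumber(value.getMillis());
      }
    });
    ObjectMapper longMapper = new ObjectMapper().registerModule(asLong);

    // Crude timing loop with no JVM warmup; use JMH for real measurements.
    System.out.println("ISO string : " + time(isoMapper, rows) + " ms");
    System.out.println("long millis: " + time(longMapper, rows) + " ms");
  }

  private static long time(ObjectMapper mapper, List<DateTime> rows) throws IOException
  {
    long start = System.nanoTime();
    for (int i = 0; i < ITERATIONS; i++) {
      mapper.writeValueAsBytes(rows);
    }
    return (System.nanoTime() - start) / 1_000_000;
  }
}
```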