From 2549fe7dc2de5d952eb6043b98cd09aac0a3698e Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Tue, 18 May 2021 17:25:04 +0800 Subject: [PATCH] detect DateTime format till success --- docs/dev/introduction.md | 2 +- go.sum | 3 --- parser/fastjson.go | 4 ++-- parser/parser.go | 11 +++++++---- task/task.go | 3 +++ 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/docs/dev/introduction.md b/docs/dev/introduction.md index f9edc5a2..8a046ff1 100644 --- a/docs/dev/introduction.md +++ b/docs/dev/introduction.md @@ -28,7 +28,7 @@ Refers to [design](./design.md) for how it works. - [x] UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 - [x] Float32, Float64 - [x] String, FixedString, LowCardinality(String) -- [x] Date, DateTime, DateTime64. Automatically detect [these date formats](https://github.com/housepower/clickhouse_sinker/blob/master/parser/parser.go). +- [x] Date, DateTime, DateTime64. Assuming that all values of a field of kafka message has the same layout, and layouts of each field are unrelated. Automatically detect the layout from [these date layouts](https://github.com/housepower/clickhouse_sinker/blob/master/parser/parser.go) till the first successful detection and reuse that layout forever. - [x] Array(T), where T is one of above basic types - [x] Nullable(T), where T is one of above basic types - [x] [ElasticDateTime](https://www.elastic.co/guide/en/elasticsearch/reference/current/date.html) => Int64 (2019-12-16T12:10:30Z => 1576498230) diff --git a/go.sum b/go.sum index 615cc565..4acd052a 100644 --- a/go.sum +++ b/go.sum @@ -328,8 +328,6 @@ github.com/samuel/go-zookeeper v0.0.0-20190923202752-2cc03de413da/go.mod h1:gi+0 github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/segmentio/kafka-go v0.4.8 h1:LO36H2tb7RcCRjsYzT/qf7xE+vRBXgddZDD82e1eiWY= github.com/segmentio/kafka-go v0.4.8/go.mod h1:Inh7PqOsxmfgasV8InZYKVXWsdjcCq2d9tFV75GLbuM= -github.com/segmentio/kafka-go v0.4.16 h1:9dt78ehM9qzAkekA60D6A96RlqDzC3hnYYa8y5Szd+U= -github.com/segmentio/kafka-go v0.4.16/go.mod h1:19+Eg7KwrNKy/PFhiIthEPkO8k+ac7/ZYXwYM9Df10w= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= @@ -349,7 +347,6 @@ github.com/streadway/amqp v0.0.0-20200108173154-1c71cc93ed71 h1:2MR0pKUzlP3SGgj5 github.com/streadway/amqp v0.0.0-20200108173154-1c71cc93ed71/go.mod h1:AZpEONHx3DKn8O/DFsRAY58/XVQiIPMTMB1SddzLXVw= github.com/streadway/handy v0.0.0-20190108123426-d5acb3125c2a/go.mod h1:qNTQ5P5JnDBl6z3cMAg/SywNDC5ABu5ApDIw6lUbRmI= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.1.1 h1:2vfRuCMp5sSVIDSqO8oNnWJq7mPa6KVP3iPIwFBuy8A= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= diff --git a/parser/fastjson.go b/parser/fastjson.go index 2fc4bc88..94cb0e5d 100644 --- a/parser/fastjson.go +++ b/parser/fastjson.go @@ -122,7 +122,7 @@ func (c *FastjsonMetric) GetDateTime(key string, nullable bool) (val interface{} val = UnixFloat(f) case fastjson.TypeString: var b []byte - if b, err = v.StringBytes(); err != nil { + if b, err = v.StringBytes(); err != nil || len(b) == 0 { val = Epoch return } @@ -189,7 +189,7 @@ func (c *FastjsonMetric) GetArray(key string, typ int) (val interface{}) { t = UnixFloat(f) } case fastjson.TypeString: - if b, err := e.StringBytes(); err != nil { + if b, err := e.StringBytes(); err != nil || len(b) == 0 { t = Epoch } else { t = c.pp.ParseDateTime(key, string(b)) diff --git a/parser/parser.go b/parser/parser.go index d401daf1..68f21af7 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -130,7 +130,8 @@ func (pp *Pool) Put(p Parser) { pp.pool.Put(p) } -// Detect date format for each key at the first message. +// Assuming that all values of a field of kafka message has the same layout, and layouts of each field are unrelated. +// Automatically detect the layout from till the first successful detection and reuse that layout forever. // Return time in UTC. // Return Epoch if parsing fail. func (pp *Pool) ParseDateTime(key string, val string) (t time.Time) { @@ -138,19 +139,21 @@ func (pp *Pool) ParseDateTime(key string, val string) (t time.Time) { var layout string var lay interface{} var ok bool + if val == "" { + t = Epoch + return + } if lay, ok = pp.knownLayouts.Load(key); !ok { t, layout = parseInLocation(val, pp.timeZone) if layout != "" { pp.knownLayouts.Store(key, layout) return } - pp.knownLayouts.Store(key, nil) } - if lay == nil { + if layout, ok = lay.(string); !ok { t = Epoch return } - layout, _ = lay.(string) if t, err = time.ParseInLocation(layout, val, pp.timeZone); err != nil { t = Epoch return diff --git a/task/task.go b/task/task.go index 8810eaef..ca45c8ef 100644 --- a/task/task.go +++ b/task/task.go @@ -242,6 +242,9 @@ func (service *Service) put(msg model.InputMessage) { if taskCfg.DynamicSchema.Enable { foundNewKeys = metric.GetNewKeys(&service.knownKeys, &service.newKeys) } + // Dumping message and result + //util.Logger.Debug("parsed kafka message", zap.Int("partition", msg.Partition), zap.Int64("offset", msg.Offset), + // zap.String("message value", string(msg.Value)), zap.String("row(spew)", spew.Sdump(row))) } // WARNNING: metric.GetXXX may depend on p. Don't call them after p been freed. service.pp.Put(p)