From bdc0a119549ff423943d78b0d21aebda98cb3337 Mon Sep 17 00:00:00 2001
From: Rahul Rampure
Date: Thu, 31 Aug 2023 23:25:11 +0530
Subject: [PATCH] MB-57973: Validate custom date time parser layout (#1877)

* Introduced 'sanitizedgo' datetime parser type. It validates custom layouts, allowing only Golang time package-related numeric elements.
---
 analysis/datetime/sanitized/sanitized.go      | 163 ++++++++++++++++++
 analysis/datetime/sanitized/sanitized_test.go | 127 ++++++++++++++
 config/config.go                              |   1 +
 search_test.go                                | 118 +++++++++++++
 4 files changed, 409 insertions(+)
 create mode 100644 analysis/datetime/sanitized/sanitized.go
 create mode 100644 analysis/datetime/sanitized/sanitized_test.go

diff --git a/analysis/datetime/sanitized/sanitized.go b/analysis/datetime/sanitized/sanitized.go
new file mode 100644
index 000000000..33d271e6e
--- /dev/null
+++ b/analysis/datetime/sanitized/sanitized.go
@@ -0,0 +1,163 @@
+// Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sanitized implements the "sanitizedgo" date time parser, which only
+// accepts custom layouts composed of elements of the Go time package
+// reference layouts.
+package sanitized
+
+import (
+	"fmt"
+	"regexp"
+	"time"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// Name is the type name under which this date time parser is registered.
+const Name = "sanitizedgo"
+
+// validMagicNumbers is the set of layout elements that may remain once a
+// layout has been stripped and split on separator characters.
+var validMagicNumbers = map[string]struct{}{
+	"2006":    {},
+	"06":      {}, // Year
+	"01":      {},
+	"1":       {},
+	"_1":      {},
+	"January": {},
+	"Jan":     {}, // Month
+	"02":      {},
+	"2":       {},
+	"_2":      {},
+	"__2":     {},
+	"002":     {},
+	"Monday":  {},
+	"Mon":     {}, // Day
+	"15":      {},
+	"3":       {},
+	"03":      {}, // Hour
+	"4":       {},
+	"04":      {}, // Minute
+	"5":       {},
+	"05":      {}, // Second
+	"0700":    {},
+	"070000":  {},
+	"07":      {},
+	"00":      {}, // Time zone offset
+	"":        {}, // Empty element between two adjacent separators
+}
+
+// layoutSplitRegex matches each single character that is treated as a
+// separator between layout elements.
+var layoutSplitRegex = regexp.MustCompile("[\\+\\-= :T,Z\\.<>;\\?!`~@#$%\\^&\\*|'\"\\(\\){}\\[\\]/\\\\]")
+
+// layoutStripRegex matches the tokens that can follow a time component
+// without a separator: the PM/pm marker, the MST zone abbreviation, and a
+// '.' or ',' followed by a run of zeros or nines (a fractional second
+// candidate). The candidate also consumes one trailing digit, if present,
+// so that stripLayout can tell ".000" apart from the ".01" of "2006.01.02".
+var layoutStripRegex = regexp.MustCompile(`PM|pm|MST|[.,](?:0+|9+)[0-9]?`)
+
+// DateTimeParser parses date time strings using a fixed list of layouts.
+type DateTimeParser struct {
+	layouts []string
+}
+
+// New returns a DateTimeParser that tries the given layouts in order.
+// The layouts are stored as provided; New performs no validation.
+func New(layouts []string) *DateTimeParser {
+	return &DateTimeParser{
+		layouts: layouts,
+	}
+}
+
+// ParseDateTime parses input with each configured layout in turn and returns
+// the parsed time together with the first layout that succeeded. It returns
+// analysis.ErrInvalidDateTime when no layout matches.
+func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) {
+	for _, layout := range p.layouts {
+		rv, err := time.Parse(layout, input)
+		if err == nil {
+			return rv, layout, nil
+		}
+	}
+	return time.Time{}, "", analysis.ErrInvalidDateTime
+}
+
+// stripLayout removes the tokens matched by layoutStripRegex from layout,
+// because splitting cannot separate e.g. "03:04PM" into "03:04" and "PM".
+// A run of zeros or nines after '.' or ',' is removed only when it is not
+// followed by another digit, which is how the time package recognizes
+// fractional seconds; otherwise the '.' or ',' is an ordinary separator
+// (as in "2006.01.02") and the text is left untouched for the split step.
+func stripLayout(layout string) string {
+	return layoutStripRegex.ReplaceAllStringFunc(layout, func(token string) string {
+		if token[0] != '.' && token[0] != ',' {
+			return "" // PM, pm or MST
+		}
+		// token[1] is the repeated digit. A different last byte means the
+		// run is followed by another digit, so it is not a fractional second.
+		if token[len(token)-1] != token[1] {
+			return token
+		}
+		return ""
+	})
+}
+
+// validateLayout reports whether layout is composed only of the constants
+// defined by the golang time package (https://pkg.go.dev/time#pkg-constants)
+// and separator characters, for compatibility with the golang time package.
+func validateLayout(layout string) bool {
+	// first strip the tokens that are not delimited by a separator, then
+	// split on the separator characters and verify that every element is
+	// a valid magic number
+	for _, element := range layoutSplitRegex.Split(stripLayout(layout), -1) {
+		if _, found := validMagicNumbers[element]; !found {
+			return false
+		}
+	}
+	return true
+}
+
+// DateTimeParserConstructor builds a DateTimeParser from config["layouts"],
+// which must be a []interface{}. It returns an error when that key is
+// missing or has another type, or when any string layout fails validation.
+// Entries that are not strings are skipped.
+func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
+	layouts, ok := config["layouts"].([]interface{})
+	if !ok {
+		return nil, fmt.Errorf("must specify layouts")
+	}
+	var layoutStrs []string
+	for _, layout := range layouts {
+		layoutStr, ok := layout.(string)
+		if !ok {
+			continue
+		}
+		if !validateLayout(layoutStr) {
+			return nil, fmt.Errorf("invalid datetime parser layout: %s,"+
+				" please refer to https://pkg.go.dev/time#pkg-constants for supported"+
+				" layouts", layoutStr)
+		}
+		layoutStrs = append(layoutStrs, layoutStr)
+	}
+	return New(layoutStrs), nil
+}
+
+// init registers DateTimeParserConstructor under Name.
+func init() {
+	registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
+}
diff --git a/analysis/datetime/sanitized/sanitized_test.go b/analysis/datetime/sanitized/sanitized_test.go
new file mode 100644
index 000000000..d680b248b
--- /dev/null
+++ b/analysis/datetime/sanitized/sanitized_test.go
@@ -0,0 +1,127 @@
+package sanitized
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestLayoutValidatorRegex(t *testing.T) {
+	splitRegexTests := []struct {
+		input  string
+		output []string
+	}{
+		{
+			input:  "2014-08-03",
+			output: []string{"2014", "08", "03"},
+		},
+		{
+			input:  "2014-08-03T15:59:30",
+			output: []string{"2014", "08", "03", "15", "59", "30"},
+		},
+		{
+			input:  "2014.08-03 15/59`30",
+			output: []string{"2014", "08", "03", "15", "59", "30"},
+		},
+		{
+			input:  "2014/08/03T15:59:30Z08:00",
+			output: []string{"2014", "08", "03", "15", "59", "30", "08", "00"},
+		},
+		{
+			input:  "2014\\08|03T15=59.30.999999999+08*00",
+			output: []string{"2014", "08", "03", "15", "59", "30", "999999999", "08", "00"},
+		},
+		{
+			input:  "2006-01-02T15:04:05.999999999Z07:00",
+			output: []string{"2006", "01", "02", "15", "04", "05", "999999999", "07", "00"},
+		},
+		{
+			input: "A-B C:DTE,FZG.HJ;K?L!M`N~O@P#Q$R%S^U&V*W|X'Y\"A(B)C{D}E[F]G/H\\I+J=L",
+			output: []string{"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
+				"Q", "R", "S", "U", "V", "W", "X", "Y", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "L"},
+		},
+	}
+	for _, test := range splitRegexTests {
+		t.Run(test.input, func(t *testing.T) {
+			actualOutput := layoutSplitRegex.Split(test.input, -1)
+			if !reflect.DeepEqual(actualOutput, test.output) {
+				t.Fatalf("expected output %v, got %v", test.output, actualOutput)
+			}
+		})
+	}
+
+	// stripLayout must drop PM/pm/MST and fractional seconds, but keep a '.'
+	// or ',' that merely separates two layout elements.
+	stripTests := []struct {
+		input  string
+		output string
+	}{
+		{
+			input:  "3PM",
+			output: "3",
+		},
+		{
+			input:  "3.0PM",
+			output: "3",
+		},
+		{
+			input:  "3.9AM",
+			output: "3AM",
+		},
+		{
+			input:  "3.999999999pm",
+			output: "3",
+		},
+		{
+			input:  "2006-01-02T15:04:05.999999999Z07:00MST",
+			output: "2006-01-02T15:04:05Z07:00",
+		},
+		{
+			input:  "Jan _2 15:04:05.0000000+07:00MST",
+			output: "Jan _2 15:04:05+07:00",
+		},
+		{
+			input:  "15:04:05.99PM+07:00MST",
+			output: "15:04:05+07:00",
+		},
+		{
+			input:  "2006.01.02",
+			output: "2006.01.02",
+		},
+		{
+			input:  "15:04:05,000",
+			output: "15:04:05",
+		},
+	}
+	for _, test := range stripTests {
+		t.Run(test.input, func(t *testing.T) {
+			actualOutput := stripLayout(test.input)
+			if actualOutput != test.output {
+				t.Fatalf("expected output %q, got %q", test.output, actualOutput)
+			}
+		})
+	}
+}
+
+// TestValidateLayout covers accepted and rejected layouts end to end.
+func TestValidateLayout(t *testing.T) {
+	tests := []struct {
+		layout string
+		valid  bool
+	}{
+		{layout: "2006-01-02T15:04:05.999999999Z07:00", valid: true},
+		{layout: "03:04:05PM", valid: true},
+		{layout: "2006.01.02", valid: true},
+		{layout: "02.01.2006 15:04:05.000", valid: true},
+		{layout: "15:04:05,000", valid: true},
+		{layout: "2006-01-02 15:04:05.445", valid: false},
+		{layout: "2000-03-31", valid: false},
+		{layout: "4:45PM", valid: false},
+	}
+	for _, test := range tests {
+		t.Run(test.layout, func(t *testing.T) {
+			if actual := validateLayout(test.layout); actual != test.valid {
+				t.Fatalf("expected validateLayout(%q) to be %t, got %t", test.layout, test.valid, actual)
+			}
+		})
+	}
+}
diff --git a/config/config.go b/config/config.go
index d098cabbb..e30fe48ff 100644
--- a/config/config.go
+++ b/config/config.go
@@ -71,6 +71,7 @@ import (
 	// date time parsers
 	_ "github.com/blevesearch/bleve/v2/analysis/datetime/flexible"
 	_ "github.com/blevesearch/bleve/v2/analysis/datetime/optional"
+	_ "github.com/blevesearch/bleve/v2/analysis/datetime/sanitized"
 
 	// languages
 	_ "github.com/blevesearch/bleve/v2/analysis/lang/ar"
diff --git a/search_test.go b/search_test.go
index cdd5a4507..414c907c8 100644
--- a/search_test.go
+++ b/search_test.go
@@ -29,6 +29,8 @@ import (
 	"github.com/blevesearch/bleve/v2/analysis/analyzer/standard"
 	html_char_filter "github.com/blevesearch/bleve/v2/analysis/char/html"
 	regexp_char_filter "github.com/blevesearch/bleve/v2/analysis/char/regexp"
+	"github.com/blevesearch/bleve/v2/analysis/datetime/flexible"
+	"github.com/blevesearch/bleve/v2/analysis/datetime/sanitized"
 	"github.com/blevesearch/bleve/v2/analysis/token/length"
 	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
 	"github.com/blevesearch/bleve/v2/analysis/token/shingle"
@@ -2355,3 +2357,119 @@ func TestAnalyzerInheritanceForDefaultDynamicMapping(t *testing.T) {
 		t.Fatalf("expected 1 hit, got %d", len(results.Hits))
 	}
 }
+
+// TestCustomDateTimeParserLayoutValidation checks that the sanitizedgo parser
+// accepts layouts built from time package constants and rejects layouts that
+// are concrete date time values, while flexiblego keeps accepting both.
+func TestCustomDateTimeParserLayoutValidation(t *testing.T) {
+	flexiblegoName := flexible.Name
+	sanitizedgoName := sanitized.Name
+	imap := mapping.NewIndexMapping()
+	correctConfig := map[string]interface{}{
+		"type": sanitizedgoName,
+		"layouts": []interface{}{
+			// some custom layouts
+			"2006-01-02 15:04:05.0000",
+			"2006\\01\\02T03:04:05PM",
+			"2006/01/02",
+			"2006-01-02T15:04:05.999Z0700PMMST",
+			"15:04:05.0000Z07:00 Monday",
+
+			// standard layouts
+			time.Layout,
+			time.ANSIC,
+			time.UnixDate,
+			time.RubyDate,
+			time.RFC822,
+			time.RFC822Z,
+			time.RFC850,
+			time.RFC1123,
+			time.RFC1123Z,
+			time.RFC3339,
+			time.RFC3339Nano,
+			time.Kitchen,
+			time.Stamp,
+			time.StampMilli,
+			time.StampMicro,
+			time.StampNano,
+			"2006-01-02 15:04:05", //time.DateTime
+			"2006-01-02",          //time.DateOnly
+			"15:04:05",            //time.TimeOnly
+
+			// Corrected layouts to the incorrect ones below.
+			"2006-01-02 03:04:05 -0700",
+			"2006-01-02 15:04:05 -0700",
+			"3:04PM",
+			"2006-01-02 15:04:05.000 -0700 MST",
+			"January 2 2006 3:04 PM",
+			"02/Jan/06 3:04PM",
+			"Mon 02 Jan 3:04:05 PM",
+		},
+	}
+
+	// Correct layouts - sanitizedgo should work without errors.
+	err := imap.AddCustomDateTimeParser("custDT", correctConfig)
+	if err != nil {
+		t.Fatalf("expected no error, got: %v", err)
+	}
+	// Flexiblego should work without errors as well.
+	correctConfig["type"] = flexiblegoName
+	err = imap.AddCustomDateTimeParser("custDT_Flexi", correctConfig)
+	if err != nil {
+		t.Fatalf("expected no error, got: %v", err)
+	}
+
+	// These are concrete date time values rather than layouts, so sanitizedgo
+	// must reject every one of them.
+	incorrectLayouts := [][]interface{}{
+		{
+			"2000-03-31 01:33:51 +0300",
+		},
+		{
+			"2006-01-02 15:04:51 +0300",
+		},
+		{
+			"2000-03-31 01:33:05 +0300",
+		},
+		{
+			"4:45PM",
+		},
+		{
+			"2006-01-02 15:04:05.445 -0700 MST",
+		},
+		{
+			"August 20 2001 8:55 AM",
+		},
+		{
+			"28/Jul/23 12:48PM",
+		},
+		{
+			"Tue 22 Aug 6:37:30 AM",
+		},
+	}
+
+	// first check sanitizedgo, should throw error for each of the incorrect layouts.
+	// Each failure is reported individually so the offending layout is visible.
+	for idx, badLayout := range incorrectLayouts {
+		incorrectConfig := map[string]interface{}{
+			"type":    sanitizedgoName,
+			"layouts": badLayout,
+		}
+		err := imap.AddCustomDateTimeParser(fmt.Sprintf("%d_DT", idx), incorrectConfig)
+		if err == nil {
+			t.Errorf("expected error for layouts %v, got none", badLayout)
+		}
+	}
+
+	// sanity test - flexiblego should still allow the incorrect layouts, for legacy purposes
+	for idx, badLayout := range incorrectLayouts {
+		incorrectConfig := map[string]interface{}{
+			"type":    flexiblegoName,
+			"layouts": badLayout,
+		}
+		err := imap.AddCustomDateTimeParser(fmt.Sprintf("%d_DT_Flexi", idx), incorrectConfig)
+		if err != nil {
+			t.Fatalf("expected no error for layouts %v, got: %v", badLayout, err)
+		}
+	}
+}