Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MB-57973: Validate custom date time parser layout #1877

Merged
merged 1 commit into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions analysis/datetime/sanitized/sanitized.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sanitized

import (
"fmt"
"regexp"
"time"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

// Name is the identifier under which this date time parser is registered.
const Name = "sanitizedgo"

// validMagicNumbers is the set of tokens validateLayout accepts once a layout
// has been stripped with layoutStripRegex and split with layoutSplitRegex.
// A layout is valid only if every token it splits into is a key of this map.
var validMagicNumbers = map[string]struct{}{
	// Year
	"2006": {},
	"06": {},
	// Month
	"01": {},
	"1": {},
	"_1": {}, // NOTE(review): "_1" is not among the month forms listed in the time package docs — confirm it is intended
	"January": {},
	"Jan": {},
	// Day (day of month, day of year and day of week forms)
	"02": {},
	"2": {},
	"_2": {},
	"__2": {},
	"002": {},
	"Monday": {},
	"Mon": {},
	// Hour
	"15": {},
	"3": {},
	"03": {},
	// Minute
	"4": {},
	"04": {},
	// Second
	"5": {},
	"05": {},
	// Numeric pieces of a time zone offset that remain after splitting on
	// '+', '-', 'Z' and ':' (e.g. "-0700" yields "0700", "Z07:00" yields
	// "07" and "00").
	"0700": {},
	"070000": {},
	"07": {},
	"00": {},
	// Empty token, produced by the split when two separators are adjacent
	// or when a separator begins or ends the layout.
	"": {},
}

// layoutSplitRegex matches a single separator character. Layouts are split
// into tokens at every match. Besides punctuation and the space character,
// the letters 'T' and 'Z' are treated as separators.
var layoutSplitRegex = regexp.MustCompile("[\\+\\-= :T,Z\\.<>;\\?!`~@#$%\\^&\\*|'\"\\(\\){}\\[\\]/\\\\]")

// layoutStripRegex matches the parts of a layout that are removed before
// splitting: "PM" / "pm", fractional seconds written as ".9+" or ".0+", and
// "MST".
var layoutStripRegex = regexp.MustCompile(`PM|pm|\.9+|\.0+|MST`)

// DateTimeParser parses date time strings by trying a fixed list of layouts.
type DateTimeParser struct {
	// layouts holds the layouts that are tried when parsing an input.
	layouts []string
}

// New returns a DateTimeParser that uses the given layouts. The slice is
// stored as passed in; it is not copied.
func New(layouts []string) *DateTimeParser {
	parser := new(DateTimeParser)
	parser.layouts = layouts
	return parser
}

// ParseDateTime tries each configured layout in order and returns the parsed
// time together with the first layout that parses input successfully.
// If no layout matches, it returns the zero time, an empty layout and
// analysis.ErrInvalidDateTime.
func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) {
	for i := range p.layouts {
		if parsed, err := time.Parse(p.layouts[i], input); err == nil {
			return parsed, p.layouts[i], nil
		}
	}
	return time.Time{}, "", analysis.ErrInvalidDateTime
}

// validateLayout reports whether layout is built only from the constants
// defined by the golang time package
// (https://pkg.go.dev/time#pkg-constants), which keeps custom layouts
// compatible with that package.
func validateLayout(layout string) bool {
	// Some constants can directly follow a time component with no separator
	// in between, e.g. "PM" in "03:04PM", so the split below cannot isolate
	// them. They are removed up front, together with the fractional second
	// forms ".9+" and ".0+".
	stripped := layoutStripRegex.ReplaceAllString(layout, "")
	// What is left is cut at every separator character; each resulting
	// piece has to be a known magic number.
	for _, piece := range layoutSplitRegex.Split(stripped, -1) {
		if _, known := validMagicNumbers[piece]; !known {
			return false
		}
	}
	return true
}

// DateTimeParserConstructor builds a DateTimeParser from config.
// config["layouts"] must be a []interface{}, otherwise an error is returned.
// Entries that are not strings are skipped; a string entry that fails
// validateLayout aborts construction with an error naming that layout.
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
	rawLayouts, isList := config["layouts"].([]interface{})
	if !isList {
		return nil, fmt.Errorf("must specify layouts")
	}
	var accepted []string
	for _, raw := range rawLayouts {
		candidate, isString := raw.(string)
		if !isString {
			// non-string entries are ignored rather than rejected
			continue
		}
		if validateLayout(candidate) {
			accepted = append(accepted, candidate)
			continue
		}
		return nil, fmt.Errorf("invalid datetime parser layout: %s,"+
			" please refer to https://pkg.go.dev/time#pkg-constants for supported"+
			" layouts", candidate)
	}
	return New(accepted), nil
}

// init registers DateTimeParserConstructor with the registry under Name.
func init() {
	registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
}
95 changes: 95 additions & 0 deletions analysis/datetime/sanitized/sanitized_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package sanitized

import (
"reflect"
"testing"
)

// TestLayoutValidatorRegex exercises the two regular expressions used for
// layout validation: layoutSplitRegex must cut an input at every separator
// character, and layoutStripRegex must remove "PM"/"pm", fractional seconds
// and "MST" from an input.
func TestLayoutValidatorRegex(t *testing.T) {
	splitRegexTests := []struct {
		input  string
		output []string
	}{
		{
			input:  "2014-08-03",
			output: []string{"2014", "08", "03"},
		},
		{
			input:  "2014-08-03T15:59:30",
			output: []string{"2014", "08", "03", "15", "59", "30"},
		},
		{
			input:  "2014.08-03 15/59`30",
			output: []string{"2014", "08", "03", "15", "59", "30"},
		},
		{
			input:  "2014/08/03T15:59:30Z08:00",
			output: []string{"2014", "08", "03", "15", "59", "30", "08", "00"},
		},
		{
			input:  "2014\\08|03T15=59.30.999999999+08*00",
			output: []string{"2014", "08", "03", "15", "59", "30", "999999999", "08", "00"},
		},
		{
			input:  "2006-01-02T15:04:05.999999999Z07:00",
			output: []string{"2006", "01", "02", "15", "04", "05", "999999999", "07", "00"},
		},
		{
			input: "A-B C:DTE,FZG.H<I>J;K?L!M`N~O@P#Q$R%S^U&V*W|X'Y\"A(B)C{D}E[F]G/H\\I+J=L",
			output: []string{"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
				"Q", "R", "S", "U", "V", "W", "X", "Y", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "L"},
		},
	}
	for _, test := range splitRegexTests {
		t.Run(test.input, func(t *testing.T) {
			actualOutput := layoutSplitRegex.Split(test.input, -1)
			// slices cannot be compared with ==, hence DeepEqual here
			if !reflect.DeepEqual(actualOutput, test.output) {
				t.Fatalf("expected output %v, got %v", test.output, actualOutput)
			}
		})
	}

	stripRegexTests := []struct {
		input  string
		output string
	}{
		{
			input:  "3PM",
			output: "3",
		},
		{
			input:  "3.0PM",
			output: "3",
		},
		{
			// "AM" is not stripped, only the fractional part is
			input:  "3.9AM",
			output: "3AM",
		},
		{
			input:  "3.999999999pm",
			output: "3",
		},
		{
			input:  "2006-01-02T15:04:05.999999999Z07:00MST",
			output: "2006-01-02T15:04:05Z07:00",
		},
		{
			input:  "Jan _2 15:04:05.0000000+07:00MST",
			output: "Jan _2 15:04:05+07:00",
		},
		{
			input:  "15:04:05.99PM+07:00MST",
			output: "15:04:05+07:00",
		},
	}
	// Each loop names the regex it tests directly; the previous shared
	// "regex" variable was reassigned here but never read afterwards.
	for _, test := range stripRegexTests {
		t.Run(test.input, func(t *testing.T) {
			actualOutput := layoutStripRegex.ReplaceAllString(test.input, "")
			if actualOutput != test.output {
				t.Fatalf("expected output %v, got %v", test.output, actualOutput)
			}
		})
	}
}
1 change: 1 addition & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ import (
// date time parsers
_ "github.com/blevesearch/bleve/v2/analysis/datetime/flexible"
_ "github.com/blevesearch/bleve/v2/analysis/datetime/optional"
_ "github.com/blevesearch/bleve/v2/analysis/datetime/sanitized"

// languages
_ "github.com/blevesearch/bleve/v2/analysis/lang/ar"
Expand Down
118 changes: 118 additions & 0 deletions search_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ import (
"github.com/blevesearch/bleve/v2/analysis/analyzer/standard"
html_char_filter "github.com/blevesearch/bleve/v2/analysis/char/html"
regexp_char_filter "github.com/blevesearch/bleve/v2/analysis/char/regexp"
"github.com/blevesearch/bleve/v2/analysis/datetime/flexible"
"github.com/blevesearch/bleve/v2/analysis/datetime/sanitized"
"github.com/blevesearch/bleve/v2/analysis/token/length"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/shingle"
Expand Down Expand Up @@ -2355,3 +2357,119 @@ func TestAnalyzerInheritanceForDefaultDynamicMapping(t *testing.T) {
t.Fatalf("expected 1 hit, got %d", len(results.Hits))
}
}

func TestCustomDateTimeParserLayoutValidation(t *testing.T) {
flexiblegoName := flexible.Name
sanitizedgoName := sanitized.Name
imap := mapping.NewIndexMapping()
correctConfig := map[string]interface{}{
"type": sanitizedgoName,
"layouts": []interface{}{
// some custom layouts
"2006-01-02 15:04:05.0000",
"2006\\01\\02T03:04:05PM",
"2006/01/02",
"2006-01-02T15:04:05.999Z0700PMMST",
"15:04:05.0000Z07:00 Monday",

// standard layouts
time.Layout,
time.ANSIC,
time.UnixDate,
time.RubyDate,
time.RFC822,
time.RFC822Z,
time.RFC850,
time.RFC1123,
time.RFC1123Z,
time.RFC3339,
time.RFC3339Nano,
time.Kitchen,
time.Stamp,
time.StampMilli,
time.StampMicro,
time.StampNano,
"2006-01-02 15:04:05", //time.DateTime
"2006-01-02", //time.DateOnly
"15:04:05", //time.TimeOnly

// Corrected layouts to the incorrect ones below.
"2006-01-02 03:04:05 -0700",
"2006-01-02 15:04:05 -0700",
"3:04PM",
"2006-01-02 15:04:05.000 -0700 MST",
"January 2 2006 3:04 PM",
"02/Jan/06 3:04PM",
"Mon 02 Jan 3:04:05 PM",
},
}

// Correct layouts - sanitizedgo should work without errors.
err := imap.AddCustomDateTimeParser("custDT", correctConfig)
if err != nil {
t.Fatalf("expected no error, got: %v", err)
}
// Flexiblego should work without errors as well.
correctConfig["type"] = flexiblegoName
err = imap.AddCustomDateTimeParser("custDT_Flexi", correctConfig)
if err != nil {
t.Fatalf("expected no error, got: %v", err)
}

incorrectLayouts := [][]interface{}{
{
"2000-03-31 01:33:51 +0300",
},
{
"2006-01-02 15:04:51 +0300",
},
{
"2000-03-31 01:33:05 +0300",
},
{
"4:45PM",
},
{
"2006-01-02 15:04:05.445 -0700 MST",
},
{
"August 20 2001 8:55 AM",
},
{
"28/Jul/23 12:48PM",
},
{
"Tue 22 Aug 6:37:30 AM",
},
}

// first check sanitizedgo, should throw error for each of the incorrect layouts.
numExpectedErrors := len(incorrectLayouts)
numActualErrors := 0
for idx, badLayout := range incorrectLayouts {
incorrectConfig := map[string]interface{}{
"type": sanitizedgoName,
"layouts": badLayout,
}
err := imap.AddCustomDateTimeParser(fmt.Sprintf("%d_DT", idx), incorrectConfig)
if err != nil {
numActualErrors++
}
}
// Expecting all layouts to be incorrect, since sanitizedgo is being used.
if numActualErrors != numExpectedErrors {
t.Fatalf("expected %d errors, got: %d", numExpectedErrors, numActualErrors)
}

// sanity test - flexiblego should still allow the incorrect layouts, for legacy purposes
for idx, badLayout := range incorrectLayouts {
incorrectConfig := map[string]interface{}{
"type": flexiblegoName,
"layouts": badLayout,
}
err := imap.AddCustomDateTimeParser(fmt.Sprintf("%d_DT_Flexi", idx), incorrectConfig)
if err != nil {
t.Fatalf("expected no error, got: %v", err)
}
}
}