From f42e909f4536437e0d715c069a5a8e6f4b34f81f Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Thu, 26 Jan 2023 10:55:29 +0900 Subject: [PATCH] allow more customisation in cardinality, fuzziness and range (#46) * allow more customisation in cardinality, fuzziness and range * allow int values for range of type double * Update README.md Co-authored-by: Edoardo Tenani <526307+endorama@users.noreply.github.com> * Update README.md Co-authored-by: Edoardo Tenani <526307+endorama@users.noreply.github.com> Co-authored-by: Edoardo Tenani <526307+endorama@users.noreply.github.com> --- README.md | 78 ++++++++---- assets/templates/aws.vpcflow/vpcflow.conf.yml | 40 ++++-- pkg/genlib/config/config.go | 16 ++- pkg/genlib/generator_interface.go | 117 ++++++++++++++---- pkg/genlib/generator_test.go | 72 ++++++++--- .../generator_with_custom_template_test.go | 32 +++-- .../generator_with_text_template_test.go | 31 +++-- 7 files changed, 282 insertions(+), 104 deletions(-) diff --git a/README.md b/README.md index bb652a3..c5b8dfb 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,9 @@ Given the above template, in the fields definition file you'll have to define an In the config file you can define all of only a subset of the fields used in the template, according to how you need to customise their behaviour, example: ```yaml - name: Field1 - cardinality: 10 + cardinality: + numerator: 1 + denominator: 100 - name: Field3 enum: ["value1", "value2"] ``` @@ -159,22 +161,40 @@ And the following config file content: - name: AccountID value: 123456789012 - name: InterfaceID - cardinality: 10 + cardinality: + numerator: 1 + denominator: 100 - name: SrcAddr - cardinality: 1 + cardinality: + numerator: 1 + denominator: 1000 - name: DstAddr - cardinality: 100 + cardinality: + numerator: 1 + denominator: 10 - name: SrcPort - range: 65535 + range: + min: 0 + max: 65535 - name: DstPort - range: 65535 - cardinality: 100 + range: + min: 0 + max: 65535 + cardinality: + numerator: 1 + denominator: 10 - name: Protocol - range: 256 + range: + min: 1 + max: 256 - name: Packets - range: 1048576 + range: + min: 1 + max: 1048576 - name: StartOffset - range: 60 + range: + min: 1 + max: 60 - name: Action enum: ["ACCEPT", "REJECT"] - name: LogStatus @@ -188,16 +208,30 @@ It is possible to tweak the randomness of the generated data through a config fi ##### Sample config ```yaml - name: aws.dynamodb.metrics.AccountMaxReads.max - fuzziness: 10 - range: 100 + fuzziness: + numerator: 1 + denominator: 10 + range: + min: 0 + max: 100 - name: aws.dynamodb.metrics.AccountMaxTableLevelReads.max - fuzziness: 5 - range: 50 - cardinality: 50 + fuzziness: + numerator: 1 + denominator: 20 + range: + min: 0 + max: 50 + cardinality: + numerator: 1 + denominator: 20 - name: aws.dynamodb.metrics.AccountProvisionedReadCapacityUtilization.avg - fuzziness: 10 + fuzziness: + numerator: 1 + denominator: 10 - name: aws.cloudwatch.namespace - cardinality: 1 + cardinality: + numerator: 1 + denominator: 1000 - name: aws.dimensions.* object_keys: - TableName @@ -211,16 +245,18 @@ It is possible to tweak the randomness of the generated data through a config fi - name: aws.dimensions.TableName enum: ["table1", "table2"] - name: aws.dimensions.Operation - cardinality: 500 + cardinality: + numerator: 1 + denominator: 2 ``` #### Config entries definition The config file is a yaml file consisting of an array of config entry. For each config entry the following fields are available - `name` *mandatory*: dotted path field -- `fuzziness` *optional (`long` and `double` type only)*: delta from the previous generated value for the same field -- `range` *optional (`long` and `double` type only)*: value will be generated between 0 and range -- `cardinality` *optional*: per-mille distribution of different values for the field +- `fuzziness` *optional (`long` and `double` type only)*: delta from the previous generated value for the same field, expressed as a ratio between a `numerator` and a `denominator` +- `range` *optional (`long` and `double` type only)*: value will be generated between `min` and `max` +- `cardinality` *optional*: distribution of different values for the field, expressed as a ratio between a `numerator` and a `denominator` - `object_keys` *optional (`object` type only)*: list of field names to generate in a object field type. if not specified a random number of field names will be generated in the object filed type. - `value` *optional*: hardcoded value to set for the field (any `cardinality` will be ignored) - `enum` *optional* (`keyword` type only)*: list of strings to randomly chose from a value to set for the field (any `cardinality` will be ignored) diff --git a/assets/templates/aws.vpcflow/vpcflow.conf.yml b/assets/templates/aws.vpcflow/vpcflow.conf.yml index 1e27759..a5aa2df 100644 --- a/assets/templates/aws.vpcflow/vpcflow.conf.yml +++ b/assets/templates/aws.vpcflow/vpcflow.conf.yml @@ -3,24 +3,44 @@ - name: AccountID value: 627286350134 - name: InterfaceID - cardinality: 10 + cardinality: + numerator: 1 + denominator: 100 - name: SrcAddr - cardinality: 1 + cardinality: + numerator: 1 + denominator: 1000 - name: DstAddr - cardinality: 100 + cardinality: + numerator: 1 + denominator: 10 - name: SrcPort - range: 65535 + range: + min: 0 + max: 65535 - name: DstPort - range: 65535 - cardinality: 100 + range: + min: 0 + max: 65535 + cardinality: + numerator: 1 + denominator: 10 - name: Protocol - range: 256 + range: + min: 1 + max: 256 - name: Packets - range: 1048576 + range: + min: 1 + max: 1048576 - name: Bytes - range: 15728640 + range: + min: 1 + max: 15728640 - name: StartOffset - range: 60 + range: + min: 1 + max: 60 - name: Action enum: ["ACCEPT", "REJECT"] - name: LogStatus diff --git a/pkg/genlib/config/config.go b/pkg/genlib/config/config.go index b6eb67a..709b24b 100644 --- a/pkg/genlib/config/config.go +++ b/pkg/genlib/config/config.go @@ -6,15 +6,25 @@ import ( "os" ) +type Ratio struct { + Numerator int `config:"numerator"` + Denominator int `config:"denominator"` +} + +type Range struct { + Min interface{} `config:"min"` + Max interface{} `config:"max"` +} + type Config struct { m map[string]ConfigField } type ConfigField struct { Name string `config:"name"` - Fuzziness int `config:"fuzziness"` - Range int `config:"range"` - Cardinality int `config:"cardinality"` + Fuzziness Ratio `config:"fuzziness"` + Range Range `config:"range"` + Cardinality Ratio `config:"cardinality"` Enum []string `config:"enum"` ObjectKeys []string `config:"object_keys"` Value interface{} `config:"value"` diff --git a/pkg/genlib/generator_interface.go b/pkg/genlib/generator_interface.go index 04ad2df..17951a6 100644 --- a/pkg/genlib/generator_interface.go +++ b/pkg/genlib/generator_interface.go @@ -114,7 +114,7 @@ func bindField(cfg Config, field Field, fieldMapWithReturn map[string]EmitF, fie } } - if fieldCfg.Cardinality > 0 { + if fieldCfg.Cardinality.Numerator > 0 { if withReturn { return bindCardinalityWithReturn(cfg, field, fieldMapWithReturn) } else { @@ -213,14 +213,69 @@ func bindByTypeWithReturn(cfg Config, field Field, fieldMap map[string]EmitF) (e return } +func makeFloatFunc(fieldCfg ConfigField, field Field) func() float64 { + minValue := float64(0) + maxValue := float64(0) + + switch fieldCfg.Range.Min.(type) { + case float64: + minValue = fieldCfg.Range.Min.(float64) + case uint64: + minValue = float64(fieldCfg.Range.Min.(uint64)) + case int64: + minValue = float64(fieldCfg.Range.Min.(int64)) + } + + switch fieldCfg.Range.Max.(type) { + case float64: + maxValue = fieldCfg.Range.Max.(float64) + case uint64: + maxValue = float64(fieldCfg.Range.Max.(uint64)) + case int64: + maxValue = float64(fieldCfg.Range.Max.(int64)) + } + + var dummyFunc func() float64 + + switch { + case maxValue > 0: + dummyFunc = func() float64 { return minValue + rand.Float64()*(maxValue-minValue) } + case len(field.Example) == 0: + dummyFunc = func() float64 { return rand.Float64() * 10 } + default: + totDigit := len(field.Example) + max := math.Pow10(totDigit) + dummyFunc = func() float64 { + return rand.Float64() * max + } + } + + return dummyFunc +} + func makeIntFunc(fieldCfg ConfigField, field Field) func() int { - maxValue := fieldCfg.Range + minValue := 0 + maxValue := 0 + + switch fieldCfg.Range.Min.(type) { + case uint64: + minValue = int(fieldCfg.Range.Min.(uint64)) + case int64: + minValue = int(fieldCfg.Range.Min.(int64)) + } + + switch fieldCfg.Range.Max.(type) { + case uint64: + maxValue = int(fieldCfg.Range.Max.(uint64)) + case int64: + maxValue = int(fieldCfg.Range.Max.(int64)) + } var dummyFunc func() int switch { case maxValue > 0: - dummyFunc = func() int { return rand.Intn(maxValue) } + dummyFunc = func() int { return rand.Intn(maxValue-minValue) + minValue } case len(field.Example) == 0: dummyFunc = func() int { return rand.Intn(10) } default: @@ -470,9 +525,10 @@ func bindLong(prefix []byte, fieldCfg ConfigField, field Field, fieldMap map[str dummyFunc := makeIntFunc(fieldCfg, field) - fuzziness := fieldCfg.Fuzziness + fuzzinessNumerator := fieldCfg.Fuzziness.Numerator + fuzzinessDenominator := float64(fieldCfg.Fuzziness.Denominator) - if fuzziness <= 0 { + if fuzzinessNumerator <= 0 { fieldMap[field.Name] = func(state *GenState, buf *bytes.Buffer) error { buf.Write(prefix) v := make([]byte, 0, 32) @@ -487,9 +543,11 @@ func bindLong(prefix []byte, fieldCfg ConfigField, field Field, fieldMap map[str fieldMap[field.Name] = func(state *GenState, buf *bytes.Buffer) error { dummyInt := dummyFunc() if previousDummyInt, ok := state.prevCache[field.Name].(int); ok { - adjustedRatio := 1. - float64(rand.Intn(fuzziness))/100. + adjustedRatio := float64(rand.Intn(fuzzinessNumerator)) / fuzzinessDenominator if rand.Int()%2 == 0 { - adjustedRatio = 1. + float64(rand.Intn(fuzziness))/100. + adjustedRatio += 1. + } else { + adjustedRatio = 1. - adjustedRatio } dummyInt = int(math.Ceil(float64(previousDummyInt) * adjustedRatio)) } @@ -506,13 +564,14 @@ func bindLong(prefix []byte, fieldCfg ConfigField, field Field, fieldMap map[str func bindDouble(prefix []byte, fieldCfg ConfigField, field Field, fieldMap map[string]emitFNotReturn) error { - dummyFunc := makeIntFunc(fieldCfg, field) + dummyFunc := makeFloatFunc(fieldCfg, field) - fuzziness := fieldCfg.Fuzziness + fuzzinessNumerator := fieldCfg.Fuzziness.Numerator + fuzzinessDenominator := float64(fieldCfg.Fuzziness.Denominator) - if fuzziness <= 0 { + if fuzzinessNumerator <= 0 { fieldMap[field.Name] = func(state *GenState, buf *bytes.Buffer) error { - dummyFloat := float64(dummyFunc()) / rand.Float64() + dummyFloat := dummyFunc() buf.Write(prefix) _, err := fmt.Fprintf(buf, "%f", dummyFloat) return err @@ -522,11 +581,13 @@ func bindDouble(prefix []byte, fieldCfg ConfigField, field Field, fieldMap map[s } fieldMap[field.Name] = func(state *GenState, buf *bytes.Buffer) error { - dummyFloat := float64(dummyFunc()) / rand.Float64() + dummyFloat := dummyFunc() if previousDummyFloat, ok := state.prevCache[field.Name].(float64); ok { - adjustedRatio := 1. - float64(rand.Intn(fuzziness))/100. + adjustedRatio := float64(rand.Intn(fuzzinessNumerator)) / fuzzinessDenominator if rand.Int()%2 == 0 { - adjustedRatio = 1. + float64(rand.Intn(fuzziness))/100. + adjustedRatio += 1. + } else { + adjustedRatio = 1. - adjustedRatio } dummyFloat = previousDummyFloat * adjustedRatio } @@ -542,7 +603,7 @@ func bindDouble(prefix []byte, fieldCfg ConfigField, field Field, fieldMap map[s func bindCardinality(prefix []byte, cfg Config, field Field, fieldMap map[string]emitFNotReturn, templateFieldMap map[string][]byte) error { fieldCfg, _ := cfg.GetField(field.Name) - cardinality := int(math.Ceil((1000. / float64(fieldCfg.Cardinality)))) + cardinality := int(math.Ceil((float64(fieldCfg.Cardinality.Denominator) / float64(fieldCfg.Cardinality.Numerator)))) if strings.HasSuffix(field.Name, ".*") { field.Name = replacer.Replace(field.Name) @@ -762,9 +823,10 @@ func bindLongWithReturn(fieldCfg ConfigField, field Field, fieldMap map[string]E dummyFunc := makeIntFunc(fieldCfg, field) - fuzziness := fieldCfg.Fuzziness + fuzzinessNumerator := fieldCfg.Fuzziness.Numerator + fuzzinessDenominator := float64(fieldCfg.Fuzziness.Denominator) - if fuzziness <= 0 { + if fuzzinessNumerator <= 0 { fieldMap[field.Name] = func(state *GenState, buf *bytes.Buffer) (interface{}, error) { return dummyFunc(), nil } @@ -775,9 +837,9 @@ func bindLongWithReturn(fieldCfg ConfigField, field Field, fieldMap map[string]E fieldMap[field.Name] = func(state *GenState, buf *bytes.Buffer) (interface{}, error) { dummyInt := dummyFunc() if previousDummyInt, ok := state.prevCache[field.Name].(int); ok { - adjustedRatio := 1. - float64(rand.Intn(fuzziness))/100. + adjustedRatio := 1. - float64(rand.Intn(fuzzinessNumerator))/fuzzinessDenominator if rand.Int()%2 == 0 { - adjustedRatio = 1. + float64(rand.Intn(fuzziness))/100. + adjustedRatio = 1. + float64(rand.Intn(fuzzinessNumerator))/fuzzinessDenominator } dummyInt = int(math.Ceil(float64(previousDummyInt) * adjustedRatio)) } @@ -790,24 +852,25 @@ func bindLongWithReturn(fieldCfg ConfigField, field Field, fieldMap map[string]E func bindDoubleWithReturn(fieldCfg ConfigField, field Field, fieldMap map[string]EmitF) error { - dummyFunc := makeIntFunc(fieldCfg, field) + dummyFunc := makeFloatFunc(fieldCfg, field) - fuzziness := fieldCfg.Fuzziness + fuzzinessNumerator := fieldCfg.Fuzziness.Numerator + fuzzinessDenominator := float64(fieldCfg.Fuzziness.Denominator) - if fuzziness <= 0 { + if fuzzinessNumerator <= 0 { fieldMap[field.Name] = func(state *GenState, buf *bytes.Buffer) (interface{}, error) { - return float64(dummyFunc()) / rand.Float64(), nil + return dummyFunc(), nil } return nil } fieldMap[field.Name] = func(state *GenState, buf *bytes.Buffer) (interface{}, error) { - dummyFloat := float64(dummyFunc()) / rand.Float64() + dummyFloat := dummyFunc() if previousDummyFloat, ok := state.prevCache[field.Name].(float64); ok { - adjustedRatio := 1. - float64(rand.Intn(fuzziness))/100. + adjustedRatio := 1. - float64(rand.Intn(fuzzinessNumerator))/fuzzinessDenominator if rand.Int()%2 == 0 { - adjustedRatio = 1. + float64(rand.Intn(fuzziness))/100. + adjustedRatio = 1. + float64(rand.Intn(fuzzinessNumerator))/fuzzinessDenominator } dummyFloat = previousDummyFloat * adjustedRatio } @@ -821,7 +884,7 @@ func bindDoubleWithReturn(fieldCfg ConfigField, field Field, fieldMap map[string func bindCardinalityWithReturn(cfg Config, field Field, fieldMap map[string]EmitF) error { fieldCfg, _ := cfg.GetField(field.Name) - cardinality := int(math.Ceil((1000. / float64(fieldCfg.Cardinality)))) + cardinality := int(math.Ceil((float64(fieldCfg.Cardinality.Denominator) / float64(fieldCfg.Cardinality.Numerator)))) if strings.HasSuffix(field.Name, ".*") { field.Name = replacer.Replace(field.Name) diff --git a/pkg/genlib/generator_test.go b/pkg/genlib/generator_test.go index b81d171..e2af4e7 100644 --- a/pkg/genlib/generator_test.go +++ b/pkg/genlib/generator_test.go @@ -124,22 +124,40 @@ func Benchmark_GeneratorCustomTemplateVPCFlowLogs(b *testing.B) { - name: AccountID value: 627286350134 - name: InterfaceID - cardinality: 10 + cardinality: + numerator: 1 + denominator: 100 - name: SrcAddr - cardinality: 1 + cardinality: + numerator: 1 + denominator: 1000 - name: DstAddr - cardinality: 100 + cardinality: + numerator: 1 + denominator: 10 - name: SrcPort - range: 65535 + range: + min: 0 + max: 65535 - name: DstPort - range: 65535 - cardinality: 100 + range: + min: 0 + max: 65535 + cardinality: + numerator: 1 + denominator: 10 - name: Protocol - range: 256 + range: + min: 1 + max: 256 - name: Packets - range: 1048576 + range: + min: 1 + max: 1048576 - name: Bytes - range: 15728640 + range: + min: 1 + max: 15728640 - name: Action enum: ["ACCEPT", "REJECT"] - name: LogStatus @@ -236,22 +254,40 @@ func Benchmark_GeneratorTextTemplateVPCFlowLogs(b *testing.B) { - name: AccountID value: 627286350134 - name: InterfaceID - cardinality: 10 + cardinality: + numerator: 1 + denominator: 100 - name: SrcAddr - cardinality: 1 + cardinality: + numerator: 1 + denominator: 1000 - name: DstAddr - cardinality: 100 + cardinality: + numerator: 1 + denominator: 10 - name: SrcPort - range: 65535 + range: + min: 0 + max: 65535 - name: DstPort - range: 65535 - cardinality: 100 + range: + min: 0 + max: 65535 + cardinality: + numerator: 1 + denominator: 10 - name: Protocol - range: 256 + range: + min: 1 + max: 256 - name: Packets - range: 1048576 + range: + min: 1 + max: 1048576 - name: Bytes - range: 15728640 + range: + min: 1 + max: 15728640 - name: Action enum: ["ACCEPT", "REJECT"] - name: LogStatus diff --git a/pkg/genlib/generator_with_custom_template_test.go b/pkg/genlib/generator_with_custom_template_test.go index 5c39324..4809e2b 100644 --- a/pkg/genlib/generator_with_custom_template_test.go +++ b/pkg/genlib/generator_with_custom_template_test.go @@ -13,16 +13,6 @@ import ( "github.com/elastic/elastic-integration-corpus-generator-tool/pkg/genlib/config" ) -/* -const cardinalityCfg = ` -- name: event.id - cardinality: 250 -- name: process.pid - fuzziness: 10 - range: 100 -` -*/ - func Test_ParseTemplate(t *testing.T) { testCases := []struct { template []byte @@ -228,10 +218,26 @@ func test_CardinalityTWithCustomTemplate[T any](t *testing.T, ty string) { // It's cardinality per mille, so a bit confusing :shrug: for cardinality := 1000; cardinality >= 10; cardinality /= 10 { + cardinalityDenominator := 1000 + cardinalityNumerator := cardinality + cardinalityModule := cardinalityDenominator % cardinality + if cardinalityModule == 0 { + cardinalityNumerator = 1 + cardinalityDenominator /= cardinality + } + + rangeTrailing := "" + if ty == FieldTypeFloat { + rangeTrailing = "." + } + + rangeMin := rand.Intn(100) + rangeMax := rand.Intn(10000-rangeMin) + rangeMin + // Add the range to get some variety in integers - tmpl := "- name: alpha\n cardinality: %d\n range: 10000" - yaml := []byte(fmt.Sprintf(tmpl, cardinality)) + tmpl := "- name: alpha\n cardinality:\n numerator: %d\n denominator: %d\n range:\n min: %d%s\n max: %d%s" + yaml := []byte(fmt.Sprintf(tmpl, cardinalityNumerator, cardinalityDenominator, rangeMin, rangeTrailing, rangeMax, rangeTrailing)) cfg, err := config.LoadConfigFromYaml(yaml) if err != nil { t.Fatal(err) @@ -336,7 +342,7 @@ func Test_FieldStaticOverrideNumericWithCustomTemplate(t *testing.T) { Type: FieldTypeKeyword, } - yaml := []byte("- name: alpha\n value: 33") + yaml := []byte("- name: alpha\n value: 33.") template := []byte(`{"alpha":{{.alpha}}}`) t.Logf("with template: %s", string(template)) b := testSingleTWithCustomTemplate[float64](t, fld, yaml, template) diff --git a/pkg/genlib/generator_with_text_template_test.go b/pkg/genlib/generator_with_text_template_test.go index 28ad71f..5ab1c1b 100644 --- a/pkg/genlib/generator_with_text_template_test.go +++ b/pkg/genlib/generator_with_text_template_test.go @@ -13,16 +13,6 @@ import ( "github.com/elastic/elastic-integration-corpus-generator-tool/pkg/genlib/config" ) -/* -const cardinalityCfg = ` -- name: event.id - cardinality: 250 -- name: process.pid - fuzziness: 10 - range: 100 -` -*/ - func Test_EmptyCaseWithTextTemplate(t *testing.T) { template, _ := generateTextTemplateFromField(Config{}, []Field{}) t.Logf("with template: %s", string(template)) @@ -63,9 +53,26 @@ func test_CardinalityTWithTextTemplate[T any](t *testing.T, ty string) { // It's cardinality per mille, so a bit confusing :shrug: for cardinality := 1000; cardinality >= 10; cardinality /= 10 { + cardinalityDenominator := 1000 + cardinalityNumerator := cardinality + cardinalityModule := cardinalityDenominator % cardinality + if cardinalityModule == 0 { + cardinalityNumerator = 1 + cardinalityDenominator /= cardinality + } + + rangeTrailing := "" + if ty == FieldTypeFloat { + rangeTrailing = "." + } + + rangeMin := rand.Intn(100) + rangeMax := rand.Intn(10000-rangeMin) + rangeMin + // Add the range to get some variety in integers - tmpl := "- name: alpha\n cardinality: %d\n range: 10000" - yaml := []byte(fmt.Sprintf(tmpl, cardinality)) + tmpl := "- name: alpha\n cardinality:\n numerator: %d\n denominator: %d\n range:\n min: %d%s\n max: %d%s" + + yaml := []byte(fmt.Sprintf(tmpl, cardinalityNumerator, cardinalityDenominator, rangeMin, rangeTrailing, rangeMax, rangeTrailing)) cfg, err := config.LoadConfigFromYaml(yaml) if err != nil {