Skip to content

Commit

Permalink
Allow generation of a subset of ECS and custom schema fields (#737)
Browse files Browse the repository at this point in the history
Usage:

python scripts/generator.py --subset my-field-whitelist.yml
  • Loading branch information
marshallmain authored Feb 13, 2020
1 parent 40fb29b commit fc7ab4e
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.next.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Thanks, you're awesome :-) -->

* ECS scripts now use Python 3.6+. #674
* schema_reader.py now reliably supports chaining reusable fieldsets together. #722
* Allow the artifact generator to consider and output only a subset of fields. #737
* Add support for reusing fields in places other than the top level of the destination fieldset. #739

#### Deprecated
Expand Down
20 changes: 15 additions & 5 deletions scripts/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import glob
import os
import schema_reader
import yaml
from generators import intermediate_files
from generators import csv_generator
from generators import es_template
Expand All @@ -18,20 +19,27 @@ def main():

# Load the default schemas
print('Loading default schemas')
(nested, flat) = schema_reader.load_schemas()
intermediate_fields = schema_reader.load_schemas()

# Maybe load user specified directory of schemas
if args.include:
include_glob = os.path.join(args.include, '*.yml')

print('Loading user defined schemas: {0}'.format(include_glob))

(custom_nested, custom_flat) = schema_reader.load_schemas(sorted(glob.glob(include_glob)))
intermediate_custom = schema_reader.load_schemas(sorted(glob.glob(include_glob)))
schema_reader.merge_schema_fields(intermediate_fields, intermediate_custom)

# Merge without allowing user schemas to overwrite default schemas
nested = ecs_helpers.safe_merge_dicts(nested, custom_nested)
flat = ecs_helpers.safe_merge_dicts(flat, custom_flat)
if args.subset:
subset = {}
for arg in args.subset:
for file in glob.glob(arg):
with open(file) as f:
raw = yaml.safe_load(f.read())
ecs_helpers.recursive_merge_subset_dicts(subset, raw)
intermediate_fields = ecs_helpers.fields_subset(subset, intermediate_fields)

(nested, flat) = schema_reader.generate_nested_flat(intermediate_fields)
intermediate_files.generate(nested, flat)
if args.intermediate_only:
exit()
Expand All @@ -48,6 +56,8 @@ def argument_parser():
help='generate intermediary files only')
parser.add_argument('--include', action='store',
help='include user specified directory of custom field definitions')
parser.add_argument('--subset', nargs='+',
help='render a subset of the schema')
return parser.parse_args()


Expand Down
23 changes: 23 additions & 0 deletions scripts/generators/ecs_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,29 @@ def safe_merge_dicts(a, b):
return c


def fields_subset(subset, fields):
    """Return the portion of the `fields` schema named by `subset`.

    `subset` mirrors the schema's nesting: each entry must carry a 'fields'
    key whose value is either a dict (recurse into the named children) or
    the string '*' (keep the whole field and everything below it).

    :param subset: dict describing which fields to retain.
    :param fields: full intermediate schema dict to filter.
    :returns: new dict containing only the retained fields.
    :raises KeyError: if a subset entry lacks 'fields' or names an unknown field.
    """
    retained_fields = {}
    for key, val in subset.items():
        # Every subset entry must have a 'fields' key or the subset file is invalid
        if isinstance(val['fields'], dict):
            # Shallow-copy the field so we keep all its options but can swap in
            # the trimmed 'fields' without mutating the caller's schema in place.
            retained_fields[key] = dict(fields[key])
            retained_fields[key]['fields'] = fields_subset(val['fields'], fields[key]['fields'])
        elif val['fields'] == '*':
            retained_fields[key] = fields[key]
    return retained_fields


def recursive_merge_subset_dicts(a, b):
    """Union subset definition `b` into subset definition `a`, in place.

    A '*' wildcard on either side of a given branch wins over an explicit
    dict of children, since it already covers every child.
    """
    for key, b_val in b.items():
        if key not in a:
            a[key] = b_val
            continue
        a_fields = a[key]['fields']
        b_fields = b_val['fields']
        if isinstance(a_fields, dict) and isinstance(b_fields, dict):
            # Both sides name explicit children: merge the child subsets.
            recursive_merge_subset_dicts(a_fields, b_fields)
        elif b_fields == "*":
            # Wildcard from `b` widens this branch to include everything.
            a[key]['fields'] = b_fields


def yaml_ordereddict(dumper, data):
# YAML representation of an OrderedDict will be like a dictionary, but
# respecting the order of the dictionary.
Expand Down
23 changes: 23 additions & 0 deletions scripts/schema_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,22 @@ def schema_fields_as_dictionary(schema):
nested_schema[nested_levels[-1]]['field_details'] = field


def merge_schema_fields(a, b):
    """Merge schema dict `b` into schema dict `a`, in place.

    Fields only present in `b` are added. When both sides define the same
    field, the definitions must agree on type; intermediate object/nested
    fields are merged recursively, while a redefined leaf field from `b` is
    dropped (with a warning) in favor of the existing one in `a`.

    :raises ValueError: when the two definitions disagree on field type.
    """
    for key, b_entry in b.items():
        if key not in a:
            a[key] = b_entry
            continue
        a_entry = a[key]
        # Entries without explicit field_details are intermediate objects.
        a_type = a_entry.get('field_details', {}).get('type', 'object')
        b_type = b_entry.get('field_details', {}).get('type', 'object')
        if a_type != b_type:
            raise ValueError('Schemas unmergeable: type {} does not match type {}'.format(a_type, b_type))
        if a_type not in ['object', 'nested']:
            # Leaf field defined twice: keep the first definition.
            print('Warning: dropping field {}, already defined'.format(key))
        elif 'fields' in b_entry:
            a_entry.setdefault('fields', {})
            merge_schema_fields(a_entry['fields'], b_entry['fields'])


def field_set_defaults(field):
dict_set_default(field, 'normalize', [])
if field['type'] == 'keyword':
Expand Down Expand Up @@ -157,6 +173,8 @@ def finalize_schemas(fields_nested):

schema_cleanup_values(schema)


def assemble_reusables(fields_nested):
# This happens as a second pass, so that all fieldsets have their
# fields array replaced with a fields dictionary.
for schema_name in fields_nested:
Expand Down Expand Up @@ -224,6 +242,11 @@ def load_schemas(files=ecs_files()):
"""Loads the given list of files"""
fields_intermediate = load_schema_files(files)
finalize_schemas(fields_intermediate)
return fields_intermediate


def generate_nested_flat(fields_intermediate):
assemble_reusables(fields_intermediate)
cleanup_fields_recursive(fields_intermediate, "")
fields_nested = generate_partially_flattened_fields(fields_intermediate)
fields_flat = generate_fully_flattened_fields(fields_intermediate)
Expand Down
114 changes: 114 additions & 0 deletions scripts/tests/test_ecs_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,120 @@ def test_list_slit_by(self):
split_list = ecs_helpers.list_split_by(lst, 3)
self.assertEqual(split_list, [['ecs', 'has', 'a'], ['meme', 'now']])

def test_recursive_subset_merge(self):
    """Merging two subset trees unions their branches; '*' absorbs children."""
    def star():
        # Fresh dict on every call so no two entries share mutable state.
        return {'fields': '*'}

    subset_a = {
        'field1': {'fields': {
            'subfield1': {'fields': {'subsubfield1': star()}},
            'subfield2': star(),
        }},
        'field2': star(),
    }
    subset_b = {
        'field1': {'fields': {
            'subfield1': star(),
            'subfield3': star(),
        }},
        'field2': {'fields': {'subfield2': star()}},
        'field3': star(),
    }
    expected = {
        'field1': {'fields': {
            'subfield1': star(),
            'subfield2': star(),
            'subfield3': star(),
        }},
        'field2': star(),
        'field3': star(),
    }
    ecs_helpers.recursive_merge_subset_dicts(subset_a, subset_b)
    self.assertEqual(subset_a, expected)

def test_fields_subset(self):
    """fields_subset keeps only the schema entries named in the subset."""
    def keyword_field(name, description):
        # Builds one leaf field entry; keeps the fixtures below compact.
        return {'field_details': {
            'name': name,
            'type': 'keyword',
            'description': description,
        }}

    fields = {
        'test_fieldset': {
            'name': 'test_fieldset',
            'fields': {
                'test_field1': keyword_field('test_field1', 'A test field'),
                'test_field2': keyword_field('test_field2', 'Another test field'),
            },
        }
    }
    subset = {'test_fieldset': {'fields': {'test_field1': {'fields': '*'}}}}
    expected = {
        'test_fieldset': {
            'name': 'test_fieldset',
            'fields': {
                'test_field1': keyword_field('test_field1', 'A test field'),
            },
        }
    }
    actual = ecs_helpers.fields_subset(subset, fields)
    self.assertEqual(actual, expected)


if __name__ == '__main__':
unittest.main()
2 changes: 1 addition & 1 deletion scripts/tests/test_ecs_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from scripts import schema_reader


(nested, flat) = schema_reader.load_schemas()
(nested, flat) = schema_reader.generate_nested_flat(schema_reader.load_schemas())


class TestEcsSpec(unittest.TestCase):
Expand Down
108 changes: 107 additions & 1 deletion scripts/tests/test_schema_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def test_field_set_multi_field_defaults_missing_name(self):

def test_load_schemas_with_empty_list_loads_nothing(self):
    """Loading an empty file list yields an empty intermediate schema dict."""
    result = schema_reader.load_schemas([])
    # load_schemas now returns a single dict (not a (nested, flat) tuple);
    # `({})` was just `{}` in redundant parentheses that suggested a tuple.
    self.assertEqual(result, {})

def test_flatten_fields(self):
fields = {
Expand Down Expand Up @@ -253,6 +253,112 @@ def test_cleanup_fields_recursive(self):
}
self.assertEqual(fields, expected)

def test_merge_schema_fields(self):
    """Merging unions the fieldsets; on a leaf collision the first wins."""
    def keyword_field(name, description):
        # One leaf field entry; keeps the nested fixtures readable.
        return {'field_details': {
            'name': name,
            'type': 'keyword',
            'description': description,
        }}

    fieldset1 = {'test_fieldset': {
        'name': 'test_fieldset',
        'fields': {
            'test_field1': keyword_field('test_field1', 'A test field'),
            'test_field2': keyword_field('test_field2', 'Another test field'),
        },
    }}
    fieldset2 = {'test_fieldset': {
        'name': 'test_fieldset',
        'fields': {
            'test_field1': keyword_field(
                'test_field1',
                'A test field with matching type but custom description'),
            'test_field3': keyword_field('test_field3', 'A third test field'),
        },
    }}
    # test_field1 keeps fieldset1's description; test_field3 is added.
    expected = {'test_fieldset': {
        'name': 'test_fieldset',
        'fields': {
            'test_field1': keyword_field('test_field1', 'A test field'),
            'test_field2': keyword_field('test_field2', 'Another test field'),
            'test_field3': keyword_field('test_field3', 'A third test field'),
        },
    }}
    schema_reader.merge_schema_fields(fieldset1, fieldset2)
    self.assertEqual(fieldset1, expected)

def test_merge_schema_fields_fail(self):
    """Two definitions of one field with conflicting types must raise."""
    def fieldset_with(field_type, description):
        # Same fieldset shape, varying only the conflicting leaf field.
        return {'test_fieldset': {
            'name': 'test_fieldset',
            'fields': {
                'test_field1': {'field_details': {
                    'name': 'test_field1',
                    'type': field_type,
                    'description': description,
                }},
            },
        }}

    fieldset1 = fieldset_with('keyword', 'A test field')
    fieldset2 = fieldset_with('long', 'A conflicting field')
    with self.assertRaises(ValueError):
        schema_reader.merge_schema_fields(fieldset1, fieldset2)

def test_reusable_dot_notation(self):
fieldset = {
'reusable_fieldset1': {
Expand Down

0 comments on commit fc7ab4e

Please sign in to comment.