diff --git a/CHANGELOG.next.md b/CHANGELOG.next.md
index ef3e354452..ffd18264f7 100644
--- a/CHANGELOG.next.md
+++ b/CHANGELOG.next.md
@@ -38,6 +38,7 @@ Thanks, you're awesome :-) -->
 
 * ECS scripts now use Python 3.6+. #674
 * schema_reader.py now reliably supports chaining reusable fieldsets together. #722
+* Allow the artifact generator to consider and output only a subset of fields. #737
 * Add support for reusing fields in places other than the top level of the destination fieldset. #739
 
 #### Deprecated
diff --git a/scripts/generator.py b/scripts/generator.py
index 0bf57851b9..6afeb5fc9d 100644
--- a/scripts/generator.py
+++ b/scripts/generator.py
@@ -2,6 +2,7 @@
 import glob
 import os
 import schema_reader
+import yaml
 from generators import intermediate_files
 from generators import csv_generator
 from generators import es_template
@@ -18,7 +19,7 @@ def main():
 
     # Load the default schemas
     print('Loading default schemas')
-    (nested, flat) = schema_reader.load_schemas()
+    intermediate_fields = schema_reader.load_schemas()
 
     # Maybe load user specified directory of schemas
     if args.include:
@@ -26,12 +27,19 @@
 
         print('Loading user defined schemas: {0}'.format(include_glob))
 
-        (custom_nested, custom_flat) = schema_reader.load_schemas(sorted(glob.glob(include_glob)))
+        intermediate_custom = schema_reader.load_schemas(sorted(glob.glob(include_glob)))
+        schema_reader.merge_schema_fields(intermediate_fields, intermediate_custom)
 
-        # Merge without allowing user schemas to overwrite default schemas
-        nested = ecs_helpers.safe_merge_dicts(nested, custom_nested)
-        flat = ecs_helpers.safe_merge_dicts(flat, custom_flat)
+    if args.subset:
+        subset = {}
+        for arg in args.subset:
+            for file in glob.glob(arg):
+                with open(file) as f:
+                    raw = yaml.safe_load(f.read())
+                ecs_helpers.recursive_merge_subset_dicts(subset, raw)
+        intermediate_fields = ecs_helpers.fields_subset(subset, intermediate_fields)
 
+    (nested, flat) = schema_reader.generate_nested_flat(intermediate_fields)
     intermediate_files.generate(nested, flat)
     if args.intermediate_only:
         exit()
@@ -48,6 +56,8 @@ def argument_parser():
                         help='generate intermediary files only')
     parser.add_argument('--include', action='store',
                         help='include user specified directory of custom field definitions')
+    parser.add_argument('--subset', nargs='+',
+                        help='render a subset of the schema')
     return parser.parse_args()
 
 
diff --git a/scripts/generators/ecs_helpers.py b/scripts/generators/ecs_helpers.py
index aff9965b8f..42d02aebc4 100644
--- a/scripts/generators/ecs_helpers.py
+++ b/scripts/generators/ecs_helpers.py
@@ -49,6 +49,29 @@ def safe_merge_dicts(a, b):
     return c
 
 
+def fields_subset(subset, fields):
+    retained_fields = {}
+    for key, val in subset.items():
+        # Every field must have a 'fields' key or the schema is invalid
+        if isinstance(val['fields'], dict):
+            # Copy the full field over so we get all the options, then replace the 'fields' with the right subset
+            retained_fields[key] = fields[key]
+            retained_fields[key]['fields'] = fields_subset(val['fields'], fields[key]['fields'])
+        elif val['fields'] == '*':
+            retained_fields[key] = fields[key]
+    return retained_fields
+
+
+def recursive_merge_subset_dicts(a, b):
+    for key in b:
+        if key not in a:
+            a[key] = b[key]
+        elif isinstance(a[key]['fields'], dict) and isinstance(b[key]['fields'], dict):
+            recursive_merge_subset_dicts(a[key]['fields'], b[key]['fields'])
+        elif b[key]['fields'] == "*":
+            a[key]['fields'] = b[key]['fields']
+
+
 def yaml_ordereddict(dumper, data):
     # YAML representation of an OrderedDict will be like a dictionary, but
     # respecting the order of the dictionary.
diff --git a/scripts/schema_reader.py b/scripts/schema_reader.py
index 9c0c218dbd..5e8a7c8e6e 100644
--- a/scripts/schema_reader.py
+++ b/scripts/schema_reader.py
@@ -92,6 +92,22 @@ def schema_fields_as_dictionary(schema):
         nested_schema[nested_levels[-1]]['field_details'] = field
 
 
+def merge_schema_fields(a, b):
+    for key in b:
+        if key not in a:
+            a[key] = b[key]
+        else:
+            a_type = a[key].get('field_details', {}).get('type', 'object')
+            b_type = b[key].get('field_details', {}).get('type', 'object')
+            if a_type != b_type:
+                raise ValueError('Schemas unmergeable: type {} does not match type {}'.format(a_type, b_type))
+            elif a_type not in ['object', 'nested']:
+                print('Warning: dropping field {}, already defined'.format(key))
+            elif 'fields' in b[key]:
+                a[key].setdefault('fields', {})
+                merge_schema_fields(a[key]['fields'], b[key]['fields'])
+
+
 def field_set_defaults(field):
     dict_set_default(field, 'normalize', [])
     if field['type'] == 'keyword':
@@ -157,6 +173,8 @@ def finalize_schemas(fields_nested):
 
         schema_cleanup_values(schema)
 
+
+def assemble_reusables(fields_nested):
     # This happens as a second pass, so that all fieldsets have their
     # fields array replaced with a fields dictionary.
     for schema_name in fields_nested:
@@ -224,6 +242,11 @@ def load_schemas(files=ecs_files()):
     """Loads the given list of files"""
     fields_intermediate = load_schema_files(files)
     finalize_schemas(fields_intermediate)
+    return fields_intermediate
+
+
+def generate_nested_flat(fields_intermediate):
+    assemble_reusables(fields_intermediate)
     cleanup_fields_recursive(fields_intermediate, "")
     fields_nested = generate_partially_flattened_fields(fields_intermediate)
     fields_flat = generate_fully_flattened_fields(fields_intermediate)
diff --git a/scripts/tests/test_ecs_helpers.py b/scripts/tests/test_ecs_helpers.py
index 5a7788f100..2c58c3a09f 100644
--- a/scripts/tests/test_ecs_helpers.py
+++ b/scripts/tests/test_ecs_helpers.py
@@ -84,6 +84,120 @@ def test_list_slit_by(self):
         split_list = ecs_helpers.list_split_by(lst, 3)
         self.assertEqual(split_list, [['ecs', 'has', 'a'], ['meme', 'now']])
 
+    def test_recursive_subset_merge(self):
+        subset_a = {
+            'field1': {
+                'fields': {
+                    'subfield1': {
+                        'fields': {
+                            'subsubfield1': {
+                                'fields': '*'
+                            }
+                        }
+                    },
+                    'subfield2': {
+                        'fields': '*'
+                    }
+                }
+            },
+            'field2': {
+                'fields': '*'
+            }
+        }
+        subset_b = {
+            'field1': {
+                'fields': {
+                    'subfield1': {
+                        'fields': '*'
+                    },
+                    'subfield3': {
+                        'fields': '*'
+                    }
+                }
+            },
+            'field2': {
+                'fields': {
+                    'subfield2': {
+                        'fields': '*'
+                    }
+                }
+            },
+            'field3': {
+                'fields': '*'
+            }
+        }
+        expected = {
+            'field1': {
+                'fields': {
+                    'subfield1': {
+                        'fields': '*'
+                    },
+                    'subfield2': {
+                        'fields': '*'
+                    },
+                    'subfield3': {
+                        'fields': '*'
+                    }
+                }
+            },
+            'field2': {
+                'fields': '*'
+            },
+            'field3': {
+                'fields': '*'
+            }
+        }
+        ecs_helpers.recursive_merge_subset_dicts(subset_a, subset_b)
+        self.assertEqual(subset_a, expected)
+
+    def test_fields_subset(self):
+        fields = {
+            'test_fieldset': {
+                'name': 'test_fieldset',
+                'fields': {
+                    'test_field1': {
+                        'field_details': {
+                            'name': 'test_field1',
+                            'type': 'keyword',
+                            'description': 'A test field'
+                        }
+                    },
+                    'test_field2': {
+                        'field_details': {
+                            'name': 'test_field2',
+                            'type': 'keyword',
+                            'description': 'Another test field'
+                        }
+                    }
+                }
+            }
+        }
+        subset = {
+            'test_fieldset': {
+                'fields': {
+                    'test_field1': {
+                        'fields': '*'
+                    }
+                }
+            }
+        }
+        expected = {
+            'test_fieldset': {
+                'name': 'test_fieldset',
+                'fields': {
+                    'test_field1': {
+                        'field_details': {
+                            'name': 'test_field1',
+                            'type': 'keyword',
+                            'description': 'A test field'
+                        }
+                    }
+                }
+            }
+        }
+        actual = ecs_helpers.fields_subset(subset, fields)
+        self.assertEqual(actual, expected)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/scripts/tests/test_ecs_spec.py b/scripts/tests/test_ecs_spec.py
index 40d349eaaf..b18ce92c21 100644
--- a/scripts/tests/test_ecs_spec.py
+++ b/scripts/tests/test_ecs_spec.py
@@ -7,7 +7,7 @@
 from scripts import schema_reader
 
 
-(nested, flat) = schema_reader.load_schemas()
+(nested, flat) = schema_reader.generate_nested_flat(schema_reader.load_schemas())
 
 
 class TestEcsSpec(unittest.TestCase):
diff --git a/scripts/tests/test_schema_reader.py b/scripts/tests/test_schema_reader.py
index 6b6be4c583..7cc6e2729f 100644
--- a/scripts/tests/test_schema_reader.py
+++ b/scripts/tests/test_schema_reader.py
@@ -82,7 +82,7 @@ def test_field_set_multi_field_defaults_missing_name(self):
 
     def test_load_schemas_with_empty_list_loads_nothing(self):
         result = schema_reader.load_schemas([])
-        self.assertEqual(result, ({}, {}))
+        self.assertEqual(result, ({}))
 
     def test_flatten_fields(self):
         fields = {
@@ -253,6 +253,112 @@ def test_cleanup_fields_recursive(self):
         }
         self.assertEqual(fields, expected)
 
+    def test_merge_schema_fields(self):
+        fieldset1 = {
+            'test_fieldset': {
+                'name': 'test_fieldset',
+                'fields': {
+                    'test_field1': {
+                        'field_details': {
+                            'name': 'test_field1',
+                            'type': 'keyword',
+                            'description': 'A test field'
+                        }
+                    },
+                    'test_field2': {
+                        'field_details': {
+                            'name': 'test_field2',
+                            'type': 'keyword',
+                            'description': 'Another test field'
+                        }
+                    }
+                }
+            }
+        }
+        fieldset2 = {
+            'test_fieldset': {
+                'name': 'test_fieldset',
+                'fields': {
+                    'test_field1': {
+                        'field_details': {
+                            'name': 'test_field1',
+                            'type': 'keyword',
+                            'description': 'A test field with matching type but custom description'
+                        }
+                    },
+                    'test_field3': {
+                        'field_details': {
+                            'name': 'test_field3',
+                            'type': 'keyword',
+                            'description': 'A third test field'
+                        }
+                    }
+                }
+            }
+        }
+        expected = {
+            'test_fieldset': {
+                'name': 'test_fieldset',
+                'fields': {
+                    'test_field1': {
+                        'field_details': {
+                            'name': 'test_field1',
+                            'type': 'keyword',
+                            'description': 'A test field'
+                        }
+                    },
+                    'test_field2': {
+                        'field_details': {
+                            'name': 'test_field2',
+                            'type': 'keyword',
+                            'description': 'Another test field'
+                        }
+                    },
+                    'test_field3': {
+                        'field_details': {
+                            'name': 'test_field3',
+                            'type': 'keyword',
+                            'description': 'A third test field'
+                        }
+                    }
+                }
+            }
+        }
+        schema_reader.merge_schema_fields(fieldset1, fieldset2)
+        self.assertEqual(fieldset1, expected)
+
+    def test_merge_schema_fields_fail(self):
+        fieldset1 = {
+            'test_fieldset': {
+                'name': 'test_fieldset',
+                'fields': {
+                    'test_field1': {
+                        'field_details': {
+                            'name': 'test_field1',
+                            'type': 'keyword',
+                            'description': 'A test field'
+                        }
+                    }
+                }
+            }
+        }
+        fieldset2 = {
+            'test_fieldset': {
+                'name': 'test_fieldset',
+                'fields': {
+                    'test_field1': {
+                        'field_details': {
+                            'name': 'test_field1',
+                            'type': 'long',
+                            'description': 'A conflicting field'
+                        }
+                    }
+                }
+            }
+        }
+        with self.assertRaises(ValueError):
+            schema_reader.merge_schema_fields(fieldset1, fieldset2)
+
     def test_reusable_dot_notation(self):
         fieldset = {
             'reusable_fieldset1': {
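
For reference, a rough sketch (not part of the patch) of how the new `--subset` pieces fit together. The fieldset and field names (`base`, `event.id`), the inline YAML document, and the flow shown are illustrative assumptions drawn from the code and tests above; in practice the subset lives in one or more YAML files passed as `--subset path/to/file.yml`, and the snippet assumes the same working directory and imports as `scripts/generator.py`.

```python
# Illustrative sketch only -- mirrors the flow generator.py follows when --subset is passed.
# Fieldset/field names below are hypothetical examples, not a recommended subset.
import yaml

import schema_reader
from generators import ecs_helpers

# Contents of a hypothetical subset file: each top-level key is a fieldset,
# and 'fields' is either a nested dict of fields to keep or '*' for "keep all".
raw = yaml.safe_load("""
base:
  fields: '*'
event:
  fields:
    id:
      fields: '*'
""")

subset = {}
# Multiple subset files can be combined; a '*' wins over a narrower selection.
ecs_helpers.recursive_merge_subset_dicts(subset, raw)

# Load the full intermediate schema, prune it to the subset, then render as usual.
intermediate_fields = schema_reader.load_schemas()
intermediate_fields = ecs_helpers.fields_subset(subset, intermediate_fields)
(nested, flat) = schema_reader.generate_nested_flat(intermediate_fields)
```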