Skip to content

Commit

Permalink
Allow generation of a subset of ECS and custom schema fields (#737)
Browse files Browse the repository at this point in the history
Usage:

python scripts/generator.py --subset my-field-whitelist.yml
  • Loading branch information
marshallmain authored Feb 13, 2020
1 parent 40fb29b commit fc7ab4e
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.next.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Thanks, you're awesome :-) -->

* ECS scripts now use Python 3.6+. #674
* schema_reader.py now reliably supports chaining reusable fieldsets together. #722
* Allow the artifact generator to consider and output only a subset of fields. #737
* Add support for reusing fields in places other than the top level of the destination fieldset. #739

#### Deprecated
Expand Down
20 changes: 15 additions & 5 deletions scripts/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import glob
import os
import schema_reader
import yaml
from generators import intermediate_files
from generators import csv_generator
from generators import es_template
Expand All @@ -18,20 +19,27 @@ def main():

# Load the default schemas
print('Loading default schemas')
(nested, flat) = schema_reader.load_schemas()
intermediate_fields = schema_reader.load_schemas()

# Maybe load user specified directory of schemas
if args.include:
include_glob = os.path.join(args.include, '*.yml')

print('Loading user defined schemas: {0}'.format(include_glob))

(custom_nested, custom_flat) = schema_reader.load_schemas(sorted(glob.glob(include_glob)))
intermediate_custom = schema_reader.load_schemas(sorted(glob.glob(include_glob)))
schema_reader.merge_schema_fields(intermediate_fields, intermediate_custom)

# Merge without allowing user schemas to overwrite default schemas
nested = ecs_helpers.safe_merge_dicts(nested, custom_nested)
flat = ecs_helpers.safe_merge_dicts(flat, custom_flat)
if args.subset:
subset = {}
for arg in args.subset:
for file in glob.glob(arg):
with open(file) as f:
raw = yaml.safe_load(f.read())
ecs_helpers.recursive_merge_subset_dicts(subset, raw)
intermediate_fields = ecs_helpers.fields_subset(subset, intermediate_fields)

(nested, flat) = schema_reader.generate_nested_flat(intermediate_fields)
intermediate_files.generate(nested, flat)
if args.intermediate_only:
exit()
Expand All @@ -48,6 +56,8 @@ def argument_parser():
help='generate intermediary files only')
parser.add_argument('--include', action='store',
help='include user specified directory of custom field definitions')
parser.add_argument('--subset', nargs='+',
help='render a subset of the schema')
return parser.parse_args()


Expand Down
23 changes: 23 additions & 0 deletions scripts/generators/ecs_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,29 @@ def safe_merge_dicts(a, b):
return c


def fields_subset(subset, fields):
    """Return the portion of the `fields` schema named by `subset`.

    `subset` mirrors the schema's nesting: each entry must carry a 'fields'
    key whose value is either a dict (recurse into the named children) or
    the string '*' (keep the whole field and everything below it).

    :param subset: dict describing which fields to retain.
    :param fields: full intermediate schema dict to filter.
    :returns: new dict containing only the retained fields.
    :raises KeyError: if a subset entry lacks 'fields' or names an unknown field.
    """
    retained_fields = {}
    for key, val in subset.items():
        # Every subset entry must have a 'fields' key or the subset file is invalid
        if isinstance(val['fields'], dict):
            # Shallow-copy the field so we keep all its options but can swap in
            # the trimmed 'fields' without mutating the caller's schema in place.
            retained_fields[key] = dict(fields[key])
            retained_fields[key]['fields'] = fields_subset(val['fields'], fields[key]['fields'])
        elif val['fields'] == '*':
            retained_fields[key] = fields[key]
    return retained_fields


def recursive_merge_subset_dicts(a, b):
    """Union subset definition `b` into subset definition `a`, in place.

    A '*' wildcard on either side of a given branch wins over an explicit
    dict of children, since it already covers every child.
    """
    for key, b_val in b.items():
        if key not in a:
            a[key] = b_val
            continue
        a_fields = a[key]['fields']
        b_fields = b_val['fields']
        if isinstance(a_fields, dict) and isinstance(b_fields, dict):
            # Both sides name explicit children: merge the child subsets.
            recursive_merge_subset_dicts(a_fields, b_fields)
        elif b_fields == "*":
            # Wildcard from `b` widens this branch to include everything.
            a[key]['fields'] = b_fields


def yaml_ordereddict(dumper, data):
# YAML representation of an OrderedDict will be like a dictionary, but
# respecting the order of the dictionary.
Expand Down
23 changes: 23 additions & 0 deletions scripts/schema_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,22 @@ def schema_fields_as_dictionary(schema):
nested_schema[nested_levels[-1]]['field_details'] = field


def merge_schema_fields(a, b):
    """Merge schema dict `b` into schema dict `a`, in place.

    Fields only present in `b` are added. When both sides define the same
    field, the definitions must agree on type; intermediate object/nested
    fields are merged recursively, while a redefined leaf field from `b` is
    dropped (with a warning) in favor of the existing one in `a`.

    :raises ValueError: when the two definitions disagree on field type.
    """
    for key, b_entry in b.items():
        if key not in a:
            a[key] = b_entry
            continue
        a_entry = a[key]
        # Entries without explicit field_details are intermediate objects.
        a_type = a_entry.get('field_details', {}).get('type', 'object')
        b_type = b_entry.get('field_details', {}).get('type', 'object')
        if a_type != b_type:
            raise ValueError('Schemas unmergeable: type {} does not match type {}'.format(a_type, b_type))
        if a_type not in ['object', 'nested']:
            # Leaf field defined twice: keep the first definition.
            print('Warning: dropping field {}, already defined'.format(key))
        elif 'fields' in b_entry:
            a_entry.setdefault('fields', {})
            merge_schema_fields(a_entry['fields'], b_entry['fields'])


def field_set_defaults(field):
dict_set_default(field, 'normalize', [])
if field['type'] == 'keyword':
Expand Down Expand Up @@ -157,6 +173,8 @@ def finalize_schemas(fields_nested):

schema_cleanup_values(schema)


def assemble_reusables(fields_nested):
# This happens as a second pass, so that all fieldsets have their
# fields array replaced with a fields dictionary.
for schema_name in fields_nested:
Expand Down Expand Up @@ -224,6 +242,11 @@ def load_schemas(files=ecs_files()):
"""Loads the given list of files"""
fields_intermediate = load_schema_files(files)
finalize_schemas(fields_intermediate)
return fields_intermediate


def generate_nested_flat(fields_intermediate):
assemble_reusables(fields_intermediate)
cleanup_fields_recursive(fields_intermediate, "")
fields_nested = generate_partially_flattened_fields(fields_intermediate)
fields_flat = generate_fully_flattened_fields(fields_intermediate)
Expand Down
114 changes: 114 additions & 0 deletions scripts/tests/test_ecs_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,120 @@ def test_list_slit_by(self):
split_list = ecs_helpers.list_split_by(lst, 3)
self.assertEqual(split_list, [['ecs', 'has', 'a'], ['meme', 'now']])

def test_recursive_subset_merge(self):
    """Merging two subset trees unions their branches; '*' absorbs children."""
    def star():
        # Fresh dict on every call so no two entries share mutable state.
        return {'fields': '*'}

    subset_a = {
        'field1': {'fields': {
            'subfield1': {'fields': {'subsubfield1': star()}},
            'subfield2': star(),
        }},
        'field2': star(),
    }
    subset_b = {
        'field1': {'fields': {
            'subfield1': star(),
            'subfield3': star(),
        }},
        'field2': {'fields': {'subfield2': star()}},
        'field3': star(),
    }
    expected = {
        'field1': {'fields': {
            'subfield1': star(),
            'subfield2': star(),
            'subfield3': star(),
        }},
        'field2': star(),
        'field3': star(),
    }
    ecs_helpers.recursive_merge_subset_dicts(subset_a, subset_b)
    self.assertEqual(subset_a, expected)

def test_fields_subset(self):
    """fields_subset keeps only the schema entries named in the subset."""
    def keyword_field(name, description):
        # Builds one leaf field entry; keeps the fixtures below compact.
        return {'field_details': {
            'name': name,
            'type': 'keyword',
            'description': description,
        }}

    fields = {
        'test_fieldset': {
            'name': 'test_fieldset',
            'fields': {
                'test_field1': keyword_field('test_field1', 'A test field'),
                'test_field2': keyword_field('test_field2', 'Another test field'),
            },
        }
    }
    subset = {'test_fieldset': {'fields': {'test_field1': {'fields': '*'}}}}
    expected = {
        'test_fieldset': {
            'name': 'test_fieldset',
            'fields': {
                'test_field1': keyword_field('test_field1', 'A test field'),
            },
        }
    }
    actual = ecs_helpers.fields_subset(subset, fields)
    self.assertEqual(actual, expected)


if __name__ == '__main__':
unittest.main()
2 changes: 1 addition & 1 deletion scripts/tests/test_ecs_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from scripts import schema_reader


(nested, flat) = schema_reader.load_schemas()
(nested, flat) = schema_reader.generate_nested_flat(schema_reader.load_schemas())


class TestEcsSpec(unittest.TestCase):
Expand Down
108 changes: 107 additions & 1 deletion scripts/tests/test_schema_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def test_field_set_multi_field_defaults_missing_name(self):

def test_load_schemas_with_empty_list_loads_nothing(self):
    """Loading an empty file list yields an empty intermediate schema dict."""
    result = schema_reader.load_schemas([])
    # load_schemas now returns a single dict (not a (nested, flat) tuple);
    # `({})` was just `{}` in redundant parentheses that suggested a tuple.
    self.assertEqual(result, {})

def test_flatten_fields(self):
fields = {
Expand Down Expand Up @@ -253,6 +253,112 @@ def test_cleanup_fields_recursive(self):
}
self.assertEqual(fields, expected)

def test_merge_schema_fields(self):
    """Merging unions the fieldsets; on a leaf collision the first wins."""
    def keyword_field(name, description):
        # One leaf field entry; keeps the nested fixtures readable.
        return {'field_details': {
            'name': name,
            'type': 'keyword',
            'description': description,
        }}

    fieldset1 = {'test_fieldset': {
        'name': 'test_fieldset',
        'fields': {
            'test_field1': keyword_field('test_field1', 'A test field'),
            'test_field2': keyword_field('test_field2', 'Another test field'),
        },
    }}
    fieldset2 = {'test_fieldset': {
        'name': 'test_fieldset',
        'fields': {
            'test_field1': keyword_field(
                'test_field1',
                'A test field with matching type but custom description'),
            'test_field3': keyword_field('test_field3', 'A third test field'),
        },
    }}
    # test_field1 keeps fieldset1's description; test_field3 is added.
    expected = {'test_fieldset': {
        'name': 'test_fieldset',
        'fields': {
            'test_field1': keyword_field('test_field1', 'A test field'),
            'test_field2': keyword_field('test_field2', 'Another test field'),
            'test_field3': keyword_field('test_field3', 'A third test field'),
        },
    }}
    schema_reader.merge_schema_fields(fieldset1, fieldset2)
    self.assertEqual(fieldset1, expected)

def test_merge_schema_fields_fail(self):
    """Two definitions of one field with conflicting types must raise."""
    def fieldset_with(field_type, description):
        # Same fieldset shape, varying only the conflicting leaf field.
        return {'test_fieldset': {
            'name': 'test_fieldset',
            'fields': {
                'test_field1': {'field_details': {
                    'name': 'test_field1',
                    'type': field_type,
                    'description': description,
                }},
            },
        }}

    fieldset1 = fieldset_with('keyword', 'A test field')
    fieldset2 = fieldset_with('long', 'A conflicting field')
    with self.assertRaises(ValueError):
        schema_reader.merge_schema_fields(fieldset1, fieldset2)

def test_reusable_dot_notation(self):
fieldset = {
'reusable_fieldset1': {
Expand Down

0 comments on commit fc7ab4e

Please sign in to comment.