Skip to content

Commit

Permalink
[Tooling] Add --exclude flag to Generator to support field removal te…
Browse files Browse the repository at this point in the history
…sting (#1411) (#1431)

* add --exclude flag to Generator
  • Loading branch information
djptek authored May 26, 2021
1 parent 97ce65c commit a00eb89
Show file tree
Hide file tree
Showing 17 changed files with 308 additions and 79 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.next.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Thanks, you're awesome :-) -->
#### Improvements

* Fix ecs GitHub repo link source branch #1393
* Add --exclude flag to Generator to support field removal testing #1411

#### Deprecated

Expand Down
36 changes: 36 additions & 0 deletions USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ relevant artifacts for their unique set of data sources.
* [Generator Options](#generator-options)
+ [Out](#out)
+ [Include](#include)
+ [Exclude](#exclude)
+ [Subset](#subset)
+ [Ref](#ref)
+ [Mapping & Template Settings](#mapping--template-settings)
Expand Down Expand Up @@ -192,6 +193,41 @@ Include can be used together with the `--ref` flag to merge custom fields into a

> NOTE: The `--include` mechanism will not validate custom YAML files prior to merging. This allows for modifying existing ECS fields in a custom schema without having to redefine all the mandatory field attributes.
#### Exclude

Use the `--exclude` flag to generate ephemeral ECS artifacts based on the current ECS schema field definitions minus fields considered for removal, e.g. to assess the impact of removing them. Warning! This is not the recommended route to remove a field permanently, as it is not intended to be invoked during the build process. Definitive field removal should be implemented using a custom [Subset](#subset) or via the [RFC process](https://github.com/elastic/ecs/tree/master/rfcs/README.md). Example:

```
$ python scripts/generator.py --exclude=../my-project/my-exclude-file.yml
$ python scripts/generator.py --exclude="../my-project/schemas/a*.yml"
```

The `--exclude` flag expects a path to one or more YAML files using the same [file format](https://github.com/elastic/ecs/tree/master/schemas#fields-supported-in-schemasyml) as the ECS schema files. You can also use a subset, provided that relevant `name` and `fields` fields are preserved.

```
---
- name: log
fields:
- name: original
```

The root Field Set `name` must always be present and specified with no dots `.`. Subfields may be specified using dot notation, for example:

```
---
- name: log
fields:
- name: syslog.severity.name
```

Generate artifacts using `--exclude` to load our custom definitions in addition to `--out` to place them in the desired output directory:

```
$ python scripts/generator.py --exclude ../myproject/exclude-set.yml --out ../myproject/out/
Loading schemas from local files
Running generator. ECS version 1.11.0
```

#### Subset

If your indices will never populate particular ECS fields, there's no need to include those field definitions in your index mappings. The `--subset` argument allows for passing a subset definition YAML file which indicates which field sets or specific fields to include in the generated artifacts.
Expand Down
6 changes: 5 additions & 1 deletion scripts/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from schema import cleaner
from schema import finalizer
from schema import subset_filter
from schema import exclude_filter


def main():
Expand Down Expand Up @@ -52,6 +53,7 @@ def main():
cleaner.clean(fields, strict=args.strict)
finalizer.finalize(fields)
fields = subset_filter.filter(fields, args.subset, out_dir)
fields = exclude_filter.exclude(fields, args.exclude)
nested, flat = intermediate_files.generate(fields, os.path.join(out_dir, 'ecs'), default_dirs)

if args.intermediate_only:
Expand All @@ -61,7 +63,7 @@ def main():
es_template.generate(nested, ecs_generated_version, out_dir, args.mapping_settings)
es_template.generate_legacy(flat, ecs_generated_version, out_dir, args.template_settings, args.mapping_settings)
beats.generate(nested, ecs_generated_version, out_dir)
if args.include or args.subset:
if args.include or args.subset or args.exclude:
exit()

ecs_helpers.make_dirs(docs_dir)
Expand All @@ -74,6 +76,8 @@ def argument_parser():
Note that "--include experimental/schemas" will also respect this git ref.')
parser.add_argument('--include', nargs='+',
help='include user specified directory of custom field definitions')
parser.add_argument('--exclude', nargs='+',
help='exclude user specified subset of the schema')
parser.add_argument('--subset', nargs='+',
help='render a subset of the schema')
parser.add_argument('--out', action='store', help='directory to output the generated files')
Expand Down
4 changes: 2 additions & 2 deletions scripts/generators/beats.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,9 @@ def write_beats_yaml(beats_file, ecs_version, out_dir):


def file_header():
return '''
return """
# WARNING! Do not edit this file directly, it was generated by the ECS project,
# based on ECS version {version}.
# Please visit https://github.com/elastic/ecs to suggest changes to ECS fields.
'''.lstrip()
""".lstrip()
4 changes: 2 additions & 2 deletions scripts/generators/ecs_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def yaml_load(filename):


def list_subtract(original, subtracted):
'''Subtract two lists. original = subtracted'''
"""Subtract two lists. original = subtracted"""
return [item for item in original if item not in subtracted]


Expand All @@ -175,7 +175,7 @@ def list_extract_keys(lst, key_name):


def is_intermediate(field):
'''Encapsulates the check to see if a field is an intermediate field or a "real" field.'''
"""Encapsulates the check to see if a field is an intermediate field or a "real" field."""
return ('intermediate' in field['field_details'] and field['field_details']['intermediate'])


Expand Down
4 changes: 2 additions & 2 deletions scripts/generators/es_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,13 +263,13 @@ def default_mapping_settings():


def es6_type_fallback(mappings):
'''
"""
Visits each leaf in mappings object and fallback to an
Elasticsearch 6.x supported type.
Since a field like `wildcard` won't have the same defaults as
a `keyword` field, we must add any missing defaults.
'''
"""

for (name, details) in mappings.items():
if 'type' in details:
Expand Down
12 changes: 6 additions & 6 deletions scripts/generators/intermediate_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@ def generate(fields, out_dir, default_dirs):


def generate_flat_fields(fields):
'''Generate ecs_flat.yml'''
"""Generate ecs_flat.yml"""
filtered = remove_non_root_reusables(fields)
flattened = {}
visitor.visit_fields_with_memo(filtered, accumulate_field, flattened)
return flattened


def accumulate_field(details, memo):
'''Visitor function that accumulates all field details in the memo dict'''
"""Visitor function that accumulates all field details in the memo dict"""
if 'schema_details' in details or ecs_helpers.is_intermediate(details):
return
field_details = copy.deepcopy(details['field_details'])
Expand All @@ -39,7 +39,7 @@ def accumulate_field(details, memo):


def generate_nested_fields(fields):
'''Generate ecs_nested.yml'''
"""Generate ecs_nested.yml"""
nested = {}
# Flatten each field set, but keep all resulting fields nested under their
# parent/host field set.
Expand Down Expand Up @@ -71,13 +71,13 @@ def generate_nested_fields(fields):


def remove_internal_attributes(field_details):
'''Remove attributes only relevant to the deeply nested structure, but not to ecs_flat/nested.yml.'''
"""Remove attributes only relevant to the deeply nested structure, but not to ecs_flat/nested.yml."""
field_details.pop('node_name', None)
field_details.pop('intermediate', None)


def remove_non_root_reusables(fields_nested):
'''
"""
Remove field sets that have top_level=false from the root of the field definitions.
This attribute means they're only meant to be in the "reusable/expected" locations
Expand All @@ -87,7 +87,7 @@ def remove_non_root_reusables(fields_nested):
still needs to keep all field sets at the root of the YAML file, as it
the official information about each field set. It's the responsibility of
users consuming ecs_nested.yml to skip the field sets with top_level=false.
'''
"""
fields = {}
for (name, field) in fields_nested.items():
if 'reusable' not in field['schema_details'] or field['schema_details']['reusable']['top_level']:
Expand Down
10 changes: 5 additions & 5 deletions scripts/schema/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def schema_cleanup(schema):


def schema_mandatory_attributes(schema):
'''Ensures for the presence of the mandatory schema attributes and raises if any are missing'''
"""Ensures for the presence of the mandatory schema attributes and raises if any are missing"""
current_schema_attributes = sorted(list(schema['field_details'].keys()) +
list(schema['schema_details'].keys()))
missing_attributes = ecs_helpers.list_subtract(SCHEMA_MANDATORY_ATTRIBUTES, current_schema_attributes)
Expand All @@ -74,7 +74,7 @@ def schema_mandatory_attributes(schema):


def schema_assertions_and_warnings(schema):
'''Additional checks on a fleshed out schema'''
"""Additional checks on a fleshed out schema"""
single_line_short_description(schema, strict=strict_mode)
if 'beta' in schema['field_details']:
single_line_beta_description(schema, strict=strict_mode)
Expand Down Expand Up @@ -143,7 +143,7 @@ def field_defaults(field):


def field_or_multi_field_datatype_defaults(field_details):
'''Sets datatype-related defaults on a canonical field or multi-field entries.'''
"""Sets datatype-related defaults on a canonical field or multi-field entries."""
if field_details['type'] == 'keyword':
field_details.setdefault('ignore_above', 1024)
if field_details['type'] == 'text':
Expand All @@ -160,7 +160,7 @@ def field_or_multi_field_datatype_defaults(field_details):


def field_mandatory_attributes(field):
'''Ensures for the presence of the mandatory field attributes and raises if any are missing'''
"""Ensures for the presence of the mandatory field attributes and raises if any are missing"""
if ecs_helpers.is_intermediate(field):
return
current_field_attributes = sorted(field['field_details'].keys())
Expand All @@ -180,7 +180,7 @@ def field_mandatory_attributes(field):


def field_assertions_and_warnings(field):
'''Additional checks on a fleshed out field'''
"""Additional checks on a fleshed out field"""
if not ecs_helpers.is_intermediate(field):
# check short description length if in strict mode
single_line_short_description(field, strict=strict_mode)
Expand Down
78 changes: 78 additions & 0 deletions scripts/schema/exclude_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from schema import loader

# This script should be run downstream of the subset filters - it takes
# all ECS and custom fields already loaded by the latter and explicitly
# removes a subset, for example, to simulate impact of future removals


def exclude(fields, exclude_file_globs):
    """Entry point of the exclude filter.

    Loads the exclusion definitions referenced by ``exclude_file_globs`` and,
    when any were loaded, removes the listed fields from ``fields``.

    Returns ``fields`` untouched when there is nothing to exclude, otherwise
    whatever ``exclude_fields`` returns.
    """
    definitions = load_exclude_definitions(exclude_file_globs)
    if not definitions:
        return fields
    return exclude_fields(fields, definitions)


def long_path(path_as_list):
    """Join path segments into a dotted flat name, e.g. ['log', 'level'] -> 'log.level'."""
    return '.'.join(path_as_list)


def pop_field(fields, node_path, path, removed):
    """Pop one field from a YAML-derived nested dict.

    Args:
        fields: dict of field definitions at the current nesting level; child
            fields are expected under each entry's 'fields' key. Mutated in place.
        node_path: ordered node names still to walk, relative to ``fields``.
            It is not modified.
        path: the complete ordered node names of the target field; only used to
            build the flat name that is returned or reported in errors.
        removed: flat names removed so far (``None`` entries are ignored).

    Returns:
        The dotted flat name of the removed field, or ``None`` when the field
        is absent because it, or one of its parents, was already removed.

    Raises:
        ValueError: if the field does not exist and was not previously removed,
            or if an intermediate node has no 'fields' to descend into.
    """
    head = node_path[0]

    if head not in fields:
        flat_name = long_path(path)
        # A missing field is only tolerated when it or one of its parents is
        # already gone. Compare on whole path segments so that a removed
        # 'log.level' does not vouch for a non-existent 'log.levels'.
        already_removed = any(
            flat_name == gone or flat_name.startswith(gone + '.')
            for gone in removed
            if gone is not None
        )
        if not already_removed:
            raise ValueError('--exclude specified, but no field {} found'.format(flat_name))
        return None

    if len(node_path) == 1:
        # Reached the target node: drop it and report its flat name.
        fields.pop(head)
        return long_path(path)

    if 'fields' not in fields[head]:
        raise ValueError(
            '--exclude specified, but no path to field {} found'.format(long_path(path)))

    # Recurse on a slice so the caller's node_path list is left intact.
    popped = pop_field(fields[head]['fields'], node_path[1:], path, removed)
    # if object field with no remaining fields and not 'base', pop it
    if not fields[head]['fields'] and head != 'base':
        fields.pop(head)
    return popped


def exclude_trace_path(fields, item, path, removed):
    """Remove each entry listed in ``item`` from the nested ``fields`` dict.

    ``item`` is a list of dicts carrying a 'name'; ``path`` is the list of node
    names leading to them. The result of every ``pop_field`` call is appended
    to ``removed``. Entries that themselves carry a 'fields' key are rejected
    with a ValueError.
    """
    for entry in item:
        # cater for name.with.dots: every dot-separated segment is its own node
        node_path = path.copy()
        node_path.extend(entry['name'].split('.'))

        if 'fields' in entry:
            raise ValueError('--exclude specified, can\'t parse fields in file {}'.format(item))

        root = node_path[0]
        popped = pop_field(fields, node_path, node_path.copy(), removed)
        removed.append(popped)
        # drop the root field set once nothing is left in it, except for 'base'
        if root != 'base' and root in fields and len(fields[root]['fields']) == 0:
            fields.pop(root)


def exclude_fields(fields, excludes):
    """Strip from ``fields`` every field named by the exclude definitions.

    ``excludes`` is a list of lists of field set dicts, each with a 'name' and
    a 'fields' list. ``fields`` is modified in place and also returned; a falsy
    ``excludes`` leaves it untouched.
    """
    if not excludes:
        return fields
    field_sets = (field_set for definitions in excludes for field_set in definitions)
    for field_set in field_sets:
        # every field set is traced with its own, initially empty, removal log
        exclude_trace_path(fields, field_set['fields'], [field_set['name']], [])
    return fields


def load_exclude_definitions(file_globs):
    """Load the exclude definitions referenced by ``file_globs``.

    Returns an empty list when no globs were given. Otherwise the globs are
    handed to ``loader.load_definitions`` and its result is returned.

    Raises:
        ValueError: if globs were given but nothing was loaded from them.
    """
    if file_globs:
        loaded = loader.load_definitions(file_globs)
        if loaded:
            return loaded
        raise ValueError('--exclude specified, but no exclusions found in {}'.format(file_globs))
    return []
20 changes: 10 additions & 10 deletions scripts/schema/finalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


def finalize(fields):
'''Intended entrypoint of the finalizer.'''
"""Intended entrypoint of the finalizer."""
perform_reuse(fields)
calculate_final_values(fields)

Expand All @@ -46,7 +46,7 @@ def order_reuses(fields):


def perform_reuse(fields):
'''Performs field reuse in two phases'''
"""Performs field reuse in two phases"""
foreign_reuses, self_nestings = order_reuses(fields)

# Phase 1: foreign reuse
Expand Down Expand Up @@ -99,11 +99,11 @@ def perform_reuse(fields):


def ensure_valid_reuse(reused_schema, destination_schema=None):
'''
"""
Raise if either the reused schema or destination schema have root=true.
Second param is optional, if testing for a self-nesting (where source=destination).
'''
"""
if reused_schema['schema_details']['root']:
msg = "Schema {} has attribute root=true and therefore cannot be reused.".format(
reused_schema['field_details']['name'])
Expand All @@ -115,7 +115,7 @@ def ensure_valid_reuse(reused_schema, destination_schema=None):


def append_reused_here(reused_schema, reuse_entry, destination_schema):
'''Captures two ways of denoting what field sets are reused under a given field set'''
"""Captures two ways of denoting what field sets are reused under a given field set"""
# Legacy, too limited
destination_schema['schema_details'].setdefault('nestings', [])
destination_schema['schema_details']['nestings'] = sorted(
Expand All @@ -136,15 +136,15 @@ def append_reused_here(reused_schema, reuse_entry, destination_schema):


def set_original_fieldset(fields, original_fieldset):
'''Recursively set the 'original_fieldset' attribute for all fields in a group of fields'''
"""Recursively set the 'original_fieldset' attribute for all fields in a group of fields"""
def func(details):
# Don't override if already set (e.g. 'group' for user.group.* fields)
details['field_details'].setdefault('original_fieldset', original_fieldset)
visitor.visit_fields(fields, field_func=func)


def field_group_at_path(dotted_path, fields):
'''Returns the ['fields'] hash at the dotted_path.'''
"""Returns the ['fields'] hash at the dotted_path."""
path = dotted_path.split('.')
nesting = fields
for next_field in path:
Expand All @@ -163,17 +163,17 @@ def field_group_at_path(dotted_path, fields):


def calculate_final_values(fields):
'''
"""
This function navigates all fields recursively.
It populates a few more values for the fields, especially path-based values
like flat_name.
'''
"""
visitor.visit_fields_with_path(fields, field_finalizer)


def field_finalizer(details, path):
'''This is the function called by the visitor to perform the work of calculate_final_values'''
"""This is the function called by the visitor to perform the work of calculate_final_values"""
name_array = path + [details['field_details']['node_name']]
flat_name = '.'.join(name_array)
details['field_details']['flat_name'] = flat_name
Expand Down
Loading

0 comments on commit a00eb89

Please sign in to comment.