Skip to content

Commit

Permalink
generate_eval_config: add new tool for generating eval-config thresholds
Browse files Browse the repository at this point in the history
We had a similar tool; however, it bitrotted and no longer worked. It was
dropped in [1].

Bring back something similar, although the new tool works quite differently.

The tool now takes a list of previous runs. It computes a new threshold
and writes an "eval-config.yaml".

It can also take an existing configuration with the "--config" option.
In that case it will keep the entries from the configuration and only
update the thresholds. With "--tighten-only", it will only lower the
thresholds, not increase them.

This is an initial approach to answer [2]. Probably there are still
places to improve later. This is added now, so we have a place for doing
such improvements.

[1] 137b00d
[2] https://issues.redhat.com/browse/NHE-774
  • Loading branch information
thom311 committed Dec 5, 2024
1 parent 67a6150 commit 281aebf
Show file tree
Hide file tree
Showing 7 changed files with 1,447 additions and 5 deletions.
3 changes: 2 additions & 1 deletion eval-config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Can use `updated-eval-config` for a more accurate template.
# Use `generate_eval_config.py` to update the configuration based on earlier
# runs.
IPERF_TCP:
- id: 1 # POD_TO_POD_SAME_NODE
Normal:
Expand Down
291 changes: 291 additions & 0 deletions generate_eval_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
#!/usr/bin/env python3

import argparse
import logging
import os
import sys

from collections.abc import Iterable
from typing import Any
from typing import Optional

from ktoolbox import common

import tftbase

from evalConfig import Config
from evalConfig import EvalIdentity
from tftbase import Bitrate
from tftbase import TftResults


logger = logging.getLogger("tft." + __name__)


def load_config(config: Optional[str]) -> Optional[Config]:
    """Load the base eval-config from *config*, or return None when no path was given."""
    if config:
        return Config.parse_from_file(config)
    return None


@common.iter_tuplify
def load_logs(
    logs: Iterable[str],
    *,
    skip_invalid_logs: bool = False,
) -> Iterable[TftResults]:
    """Parse each result file into a TftResults instance.

    With skip_invalid_logs, files that fail to parse are logged and
    skipped instead of raising.
    """
    for filename in list(logs):
        try:
            parsed = TftResults.parse_from_file(filename)
        except Exception as e:
            if not skip_invalid_logs:
                raise
            # Failures are not fatal here. That is because the output
            # format is not stable, so if we change the format, we may
            # be unable to parse certain older logs. Skip them.
            logger.warning(f"Skip invalid file {repr(filename)}: {e}")
            continue
        yield parsed


def collect_all_bitrates(
    config: Optional[Config],
    all_tft_results: Iterable[TftResults],
) -> dict[EvalIdentity, list[Bitrate]]:
    """Group bitrates of all successful flow tests by their EvalIdentity.

    When a base *config* is given, the result contains exactly the
    identities from the config (possibly with empty measurement lists)
    and measurements for other identities are dropped. Without a config,
    identities are created on demand from the logs.
    """
    grouped: dict[EvalIdentity, list[Bitrate]] = (
        {} if config is None else {ei: [] for ei in config.get_items()}
    )

    for tft_results in all_tft_results:
        for tft_result in tft_results:
            if not tft_result.eval_all_success:
                # This result is not valid. We don't consider it for
                # calculating the new thresholds.
                continue
            flow_test = tft_result.flow_test
            ei = EvalIdentity.from_metadata(flow_test.tft_metadata)
            if ei not in grouped:
                if config is not None:
                    # we only collect the items that we have in config too. Don't create a new one.
                    continue
                grouped[ei] = []
            grouped[ei].append(flow_test.bitrate_gbps)
    return grouped


def accumulate_rate(rate: Iterable[Optional[float]]) -> Optional[float]:
    """Collapse a series of measured rates into a single lower threshold.

    None entries are ignored; returns None when no measurements remain.
    The threshold is the larger of (mean - 2*stddev) and 80% of the mean,
    using the population standard deviation.
    """
    samples = [r for r in rate if r is not None]
    if not samples:
        return None

    n = len(samples)
    mean = sum(samples) / n
    stddev: float = (sum((s - mean) ** 2 for s in samples) / n) ** 0.5

    # Cap how far below the mean the threshold may drop, so a single
    # noisy run cannot push it arbitrarily low.
    return max(mean - 2.0 * stddev, mean * 0.8)


def accumulate_bitrates(
    bitrates: list[Bitrate],
) -> Bitrate:
    """Combine several measured bitrates into one threshold Bitrate.

    RX and TX are accumulated independently via accumulate_rate().
    """
    return Bitrate(
        rx=accumulate_rate(b.rx for b in bitrates),
        tx=accumulate_rate(b.tx for b in bitrates),
    )


def _tighten_rate(
a: Optional[float], *, base: Optional[float], tighten_only: bool
) -> Optional[float]:
if base is None:
return None
if a is None:
return base
if tighten_only:
return min(a, base)
return a


@common.iter_dictify
def accumulate_all_bitrates(
    config: Optional[Config],
    all_bitrates: dict[EvalIdentity, list[Bitrate]],
    *,
    tighten_only: bool,
) -> Iterable[tuple[EvalIdentity, Bitrate]]:
    """Accumulate the per-identity measurement lists into single Bitrates.

    With a base *config*, each accumulated value is merged against the
    configured threshold via _tighten_rate().
    """
    if config is not None:
        # collect_all_bitrates() seeded the dict from the config, so the
        # key sets must line up exactly.
        assert list(all_bitrates) == list(config.get_items())
    for ei, measured in all_bitrates.items():
        accumulated = accumulate_bitrates(measured)
        if config is None:
            yield ei, accumulated
            continue
        item = config.get_item_for_id(ei)
        assert item is not None
        base = item.bitrate
        yield ei, Bitrate(
            rx=_tighten_rate(accumulated.rx, base=base.rx, tighten_only=tighten_only),
            tx=_tighten_rate(accumulated.tx, base=base.tx, tighten_only=tighten_only),
        )


def parse_args() -> argparse.Namespace:
    """Parse command line arguments, configure logging and validate options.

    Exits with status 4 when "--tighten-only" is given without "--config".
    """
    parser = argparse.ArgumentParser(
        # Fixed: description was missing "from".
        description="Tool to generate eval-config.yaml from TFT Flow test results"
    )
    parser.add_argument(
        "logs",
        nargs="*",
        help="Result file from ocp-traffic-flow-tests.",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="eval-config.yaml",
        help="Output file to write new eval-config.yaml to.",
    )
    parser.add_argument(
        "-S",
        "--skip-invalid-logs",
        action="store_true",
        # Fixed: "logs" is a positional argument, not "--logs".
        help='If set any invalid "logs" files are ignored. This is useful because the output format is not stable, so your last logs might have been generated with an incompatible version and we want to skip those errors.',
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        # Fixed typo: "For overwriting" -> "Force overwriting".
        help="Force overwriting output file if it exists.",
    )
    parser.add_argument(
        "-c",
        "--config",
        # Fixed typo: "measurementns" -> "measurements".
        help="The base eval-config. If given, the result will contain all the entries from this input file. Values are updated with the measurements from the logs.",
    )
    parser.add_argument(
        "-T",
        "--tighten-only",
        action="store_true",
        help="With '--config' the values are only updated if they tighten/lower the thresholds.",
    )
    common.log_argparse_add_argument_verbose(parser)

    args = parser.parse_args()

    common.log_config_logger(args.verbose, "tft", "ktoolbox")

    if args.tighten_only and not args.config:
        logger.error(
            'Option "--tighten-only" requires a "--config" base configuration.'
        )
        sys.exit(4)

    return args


def bitrate_to_yaml(bitrate: Bitrate) -> dict[str, Any]:
    """Express a Bitrate as a threshold dictionary, omitting unset directions."""
    result: dict[str, Any] = {}
    for key, value in (("threshold_rx", bitrate.rx), ("threshold_tx", bitrate.tx)):
        common.dict_add_optional(result, key, value)
    return result


def generate_result_config(
    config: Optional[Config],
    bitrates: dict[EvalIdentity, Bitrate],
) -> Config:
    """Build the new eval-config from the accumulated thresholds.

    The Normal and Reverse directions of a test are merged into one entry.
    The result is normalized: test types and test cases are sorted by
    their enum values.
    """
    entries: dict[str, list[dict[str, Any]]] = {}
    seen: set[EvalIdentity] = set()
    for key in bitrates:
        ei, ei_reverse = key.both_directions()

        # Both directions are emitted together; process each pair once.
        if ei in seen:
            continue
        seen.add(ei)

        if config is not None:
            # With a base config, every identity must stem from it.
            assert config.get_item_for_id(ei) or config.get_item_for_id(ei_reverse)

        bitrate = bitrates.get(ei, Bitrate.NA)
        bitrate_reverse = bitrates.get(ei_reverse, Bitrate.NA)

        # Without a base config, drop entries that carry no data at all.
        if config is None and bitrate.is_na and bitrate_reverse.is_na:
            continue

        entry: dict[str, Any] = {
            "id": ei.test_case_id.name,
        }
        if not bitrate.is_na:
            entry["Normal"] = bitrate_to_yaml(bitrate)
        if not bitrate_reverse.is_na:
            entry["Reverse"] = bitrate_to_yaml(bitrate_reverse)

        entries.setdefault(ei.test_type.name, []).append(entry)

    # Normalize the generated dictionary by sorting.
    for entry_list in entries.values():
        entry_list.sort(key=lambda e: tftbase.TestCaseType[e["id"]].value)
    ordered_names = sorted(entries, key=lambda name: tftbase.TestType[name].value)
    return Config.parse({name: entries[name] for name in ordered_names})


def write_to_file(
    config: Config,
    *,
    output: str,
    force: bool,
) -> None:
    """Serialize *config* to *output*.

    Refuses to overwrite an existing file unless *force* is set; exits
    with status 55 in that case.
    """
    if os.path.exists(output) and not force:
        logger.error(
            f"The output file {repr(output)} already exists. Run with '--force' to overwrite"
        )
        sys.exit(55)

    config.serialize_to_file(output)


def main() -> None:
    """Entry point: read result logs, compute thresholds, write eval-config."""
    args = parse_args()

    base_config = load_config(args.config)

    logs = load_logs(
        args.logs,
        skip_invalid_logs=args.skip_invalid_logs,
    )

    collected = collect_all_bitrates(base_config, logs)
    accumulated = accumulate_all_bitrates(
        base_config,
        collected,
        tighten_only=args.tighten_only,
    )
    result_config = generate_result_config(base_config, accumulated)

    # Updating the base config in place never requires "--force".
    write_to_file(
        result_config,
        output=args.output,
        force=args.force or args.config == args.output,
    )


# Script entry point; does nothing when imported as a module.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 281aebf

Please sign in to comment.