diff --git a/.changes/unreleased/Under the Hood-20220309-142133.yaml b/.changes/unreleased/Under the Hood-20220309-142133.yaml new file mode 100644 index 00000000000..18a01963e16 --- /dev/null +++ b/.changes/unreleased/Under the Hood-20220309-142133.yaml @@ -0,0 +1,7 @@ +kind: Under the Hood +body: add performance regression testing runner without orchestration +time: 2022-03-09T14:21:33.884043-05:00 +custom: + Author: nathaniel-may + Issue: "4021" + PR: "4602" diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml deleted file mode 100644 index 1b3c75c74a4..00000000000 --- a/.github/workflows/performance.yml +++ /dev/null @@ -1,176 +0,0 @@ -name: Performance Regression Tests -# Schedule triggers -on: - # runs twice a day at 10:05am and 10:05pm - schedule: - - cron: "5 10,22 * * *" - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -jobs: - # checks fmt of runner code - # purposefully not a dependency of any other job - # will block merging, but not prevent developing - fmt: - name: Cargo fmt - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true - - run: rustup component add rustfmt - - uses: actions-rs/cargo@v1 - with: - command: fmt - args: --manifest-path performance/runner/Cargo.toml --all -- --check - - # runs any tests associated with the runner - # these tests make sure the runner logic is correct - test-runner: - name: Test Runner - runs-on: ubuntu-latest - env: - # turns errors into warnings - RUSTFLAGS: "-D warnings" - steps: - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true - - uses: actions-rs/cargo@v1 - with: - command: test - args: --manifest-path performance/runner/Cargo.toml - - # build an optimized binary to be used as the runner in later steps - build-runner: - needs: [test-runner] - name: Build Runner - runs-on: ubuntu-latest - env: - RUSTFLAGS: "-D warnings" - steps: - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true - - uses: actions-rs/cargo@v1 - with: - command: build - args: --release --manifest-path performance/runner/Cargo.toml - - uses: actions/upload-artifact@v2 - with: - name: runner - path: performance/runner/target/release/runner - - # run the performance measurements on the current or default branch - measure-dev: - needs: [build-runner] - name: Measure Dev Branch - runs-on: ubuntu-latest - steps: - - name: checkout dev - uses: actions/checkout@v2 - - name: Setup Python - uses: actions/setup-python@v2.2.2 - with: - python-version: "3.8" - - name: install dbt - run: pip install -r dev-requirements.txt -r editable-requirements.txt - - name: install hyperfine - run: wget https://github.com/sharkdp/hyperfine/releases/download/v1.11.0/hyperfine_1.11.0_amd64.deb && sudo dpkg -i hyperfine_1.11.0_amd64.deb - - uses: actions/download-artifact@v2 - with: - name: runner - - name: change permissions - run: chmod +x ./runner - - name: run - run: ./runner measure -b dev -p ${{ github.workspace }}/performance/projects/ - - uses: actions/upload-artifact@v2 - with: - name: dev-results - path: performance/results/ - - # run the performance measurements on the release branch which we use - # as a performance baseline. This part takes by far the longest, so - # we do everything we can first so the job fails fast. 
- # ----- - # we need to checkout dbt twice in this job: once for the baseline dbt - # version, and once to get the latest regression testing projects, - # metrics, and runner code from the develop or current branch so that - # the calculations match for both versions of dbt we are comparing. - measure-baseline: - needs: [build-runner] - name: Measure Baseline Branch - runs-on: ubuntu-latest - steps: - - name: checkout latest - uses: actions/checkout@v2 - with: - ref: "0.20.latest" - - name: Setup Python - uses: actions/setup-python@v2.2.2 - with: - python-version: "3.8" - - name: move repo up a level - run: mkdir ${{ github.workspace }}/../baseline/ && cp -r ${{ github.workspace }} ${{ github.workspace }}/../baseline - - name: "[debug] ls new dbt location" - run: ls ${{ github.workspace }}/../baseline/dbt/ - # installation creates egg-links so we have to preserve source - - name: install dbt from new location - run: cd ${{ github.workspace }}/../baseline/dbt/ && pip install -r dev-requirements.txt -r editable-requirements.txt - # checkout the current branch to get all the target projects - # this deletes the old checked out code which is why we had to copy before - - name: checkout dev - uses: actions/checkout@v2 - - name: install hyperfine - run: wget https://github.com/sharkdp/hyperfine/releases/download/v1.11.0/hyperfine_1.11.0_amd64.deb && sudo dpkg -i hyperfine_1.11.0_amd64.deb - - uses: actions/download-artifact@v2 - with: - name: runner - - name: change permissions - run: chmod +x ./runner - - name: run runner - run: ./runner measure -b baseline -p ${{ github.workspace }}/performance/projects/ - - uses: actions/upload-artifact@v2 - with: - name: baseline-results - path: performance/results/ - - # detect regressions on the output generated from measuring - # the two branches. Exits with non-zero code if a regression is detected. - calculate-regressions: - needs: [measure-dev, measure-baseline] - name: Compare Results - runs-on: ubuntu-latest - steps: - - uses: actions/download-artifact@v2 - with: - name: dev-results - - uses: actions/download-artifact@v2 - with: - name: baseline-results - - name: "[debug] ls result files" - run: ls - - uses: actions/download-artifact@v2 - with: - name: runner - - name: change permissions - run: chmod +x ./runner - - name: make results directory - run: mkdir ./final-output/ - - name: run calculation - run: ./runner calculate -r ./ -o ./final-output/ - # always attempt to upload the results even if there were regressions found - - uses: actions/upload-artifact@v2 - if: ${{ always() }} - with: - name: final-calculations - path: ./final-output/* diff --git a/performance/README.md b/performance/README.md index 71aba30b989..1c7ce23b884 100644 --- a/performance/README.md +++ b/performance/README.md @@ -1,18 +1,118 @@ # Performance Regression Testing -This directory includes dbt project setups to test on and a test runner written in Rust which runs specific dbt commands on each of the projects. Orchestration is done via the GitHub Action workflow in `/.github/workflows/performance.yml`. The workflow is scheduled to run every night, but it can also be triggered manually. -The github workflow hardcodes our baseline branch for performance metrics as `0.20.latest`. As future versions become faster, this branch will be updated to hold us to those new standards. +## Attention! -## Adding a new dbt project -Just make a new directory under `performance/projects/`. It will automatically be picked up by the tests. 
+PLEASE READ THIS README IN THE MAIN BRANCH +The performance runner is always pulled from main regardless of the version being modeled or sampled. If you are not in the main branch, this information may be stale. -## Adding a new dbt command -In `runner/src/measure.rs::measure` add a metric to the `metrics` Vec. The Github Action will handle recompilation if you don't have the rust toolchain installed. +## Description + +This test suite samples the performance characteristics of individual commits against performance models for prior releases. Performance is measured in project-command pairs which are assumed to conform to a normal distribution. The sampling and comparison are efficient enough to run against PRs. + +This collection of projects and commands should expand over time to reflect user feedback about poorly performing projects, so that future versions are protected against poor performance in these scenarios. + +Here are all the components of the testing module: + +- dbt project setups that are known performance bottlenecks, which you can find in `/performance/projects/`, and a runner written in Rust that runs specific dbt commands on each of the projects. +- Performance characteristics called "baselines" from released dbt versions in `/performance/baselines/`. Each branch only has the baselines for its ancestors because, when we compare samples, we compare against the latest baseline available in the branch. +- A GitHub Action for modeling the performance distribution for a new release: `/.github/workflows/model_performance.yml`. +- A GitHub Action for sampling performance of dbt at your commit and comparing it against a previous release: `/.github/workflows/sample_performance.yml`. + +At this time, the biggest risk in the design of this project is how to account for the natural variation of GitHub Action runs. Typically, performance work is done on dedicated hardware to eliminate this factor. However, there are ways to integrate the variation of our observation tools if it can be measured. + +## Adding Test Scenarios + +A clear process for maintainers and community members to add new performance testing targets will exist after the next stage of the test suite is complete. For details, see #4768. + +## Investigating Regressions + +If your commit has failed one of the performance regression tests, it does not necessarily mean your commit has a performance regression. It does mean the observed runtime value was so much slower than the expected value that it was unlikely to be random noise. If it is not due to random noise, this commit contains the code that is causing the performance regression. However, it may not be the commit that introduced that code: the offending code may have been introduced in an earlier commit that passed only because of natural variation in sampling. When investigating a performance regression, start with the failing commit and work your way backwards. + +Here's an example of how this could happen: + +``` +Commit +A <- last release +B +C <- perf regression +D +E +F <- the first failing commit +``` +- Commit A is measured to have an expected value for one performance metric of 30 seconds with a standard deviation of 0.5 seconds. +- Commit B doesn't introduce a performance regression and passes the performance regression tests.
+- Commit C introduces a performance regression such that the new expected value of the metric is 32 seconds with a standard deviation still at 0.5 seconds, but we don't know this because we don't estimate the whole performance distribution on every commit; that would be far too much work. It passes the performance regression test because we happened to sample a value of 31 seconds, which is within our threshold for the original model. It's also only 2 standard deviations away from the actual performance model of commit C, so while it's not a common outcome, it is expected to happen sometimes. +- Commit D samples a value of 31.4 seconds and passes. +- Commit E samples a value of 31.2 seconds and passes. +- Commit F samples a value of 32.9 seconds and fails. + +Because these performance regression tests are non-deterministic, it will often be possible to rerun the test on a failing commit and get it to pass. The more often we do this, the farther down the commit history we punt detection. + +If your PR is against `main`, your commits will be compared against the latest baseline measurement found in `performance/baselines`. If this commit needs to be backported, that PR will be against the `.latest` branch and will also compare against the latest baseline measurement found in `performance/baselines` in that branch. These two versions may be the same or they may be different. For example, if the latest version of dbt is v1.99.0, the performance sample of your PR against main will compare against the baseline for v1.99.0. When those commits are backported to `1.98.latest`, they will be compared against the baseline for v1.98.6 (or whatever the latest is at that time). Even if the compared baseline is the same, a different sample is taken for each PR. In this case, even though it should be rare, it is possible for a performance regression to be detected in one of the two PRs even with the same baseline due to variation in sampling. + +## The Statistics +Particle physicists need to be confident in declaring new discoveries, snack manufacturers need to be sure each individual item is within the regulated margin of error for nutrition facts, and weight-rated climbing gear needs to be produced so you can trust your life to every unit that comes off the line. All of these use cases rely on the same kind of math: sigma-based p-values. This section will peel apart that math with the help of a physicist and walk through how we apply this approach to performance regression testing in this test suite. + +You are likely familiar with forming a hypothesis of the form "A and B are correlated," which is known as _the research hypothesis_. It follows that the hypothesis "A and B are not correlated" is also relevant, and it is known as _the null hypothesis_. When looking at data, we commonly use a _p-value_ to determine the significance of the data. Formally, a _p-value_ is the probability of obtaining data at least as extreme as the ones observed, if the null hypothesis is true. To refine this definition, the experimental particle physicist [Dr.
Tommaso Dorigo](https://userswww.pd.infn.it/~dorigo/#about) has an excellent [glossary](https://www.science20.com/quantum_diaries_survivor/fundamental_glossary_higgs_broadcast-85365) of these terms that helps clarify: "'Extreme' is quite tricky instead: it depends on what is your 'alternate hypothesis' of reference, and what kind of departure it would produce on the studied statistic derived from the data. So 'extreme' will mean 'departing from the typical values expected for the null hypothesis, toward the values expected from the alternate hypothesis.'" In the context of performance regression testing, our research hypothesis is that "after commit A, the codebase includes a performance regression," which means we expect the runtime of our measured processes to be _slower_ than the expected value, not faster. + +Given this definition of p-value, we need to explicitly call out the common tendency to apply _probability inversion_ to our observations. To quote [Dr. Tommaso Dorigo](https://www.science20.com/quantum_diaries_survivor/fundamental_glossary_higgs_broadcast-85365) again, "If your ability on the long jump puts you in the 99.99% percentile, that does not mean that you are a kangaroo, and neither can one infer that the probability that you belong to the human race is 0.01%." Using our previously defined terms, the p-value is _not_ the probability that the null hypothesis _is true_. + +This brings us to calculating sigma values. Sigma refers to the standard deviation of a statistical model, which is used as a measurement of how far away an observed value is from the expected value. When we say that we have a "3 sigma result" we are saying that if the null hypothesis is true, this is a particularly unlikely observation, not that the null hypothesis is false. Exactly how unlikely depends on what the expected values from our research hypothesis are. In the context of performance regression testing, if the null hypothesis is false, we expect the results to be _slower_ than the expected value, not _slower or faster_. Looking at the normal distribution below, we can see that we only care about one _half_ of the distribution: the half where the values are slower than the expected value. This means that when we're calculating the p-value we are not including both sides of the normal distribution. + +![normal distribution](./images/normal.svg) + +Because of this, the following table describes the significance of each sigma level for our _one-sided_ hypothesis: + +| σ | p-value | scientific significance | +| --- | -------------- | ----------------------- | +| 1 σ | 1 in 6 | | +| 2 σ | 1 in 44 | | +| 3 σ | 1 in 741 | evidence | +| 4 σ | 1 in 31,574 | | +| 5 σ | 1 in 3,486,914 | discovery | + +When detecting performance regressions that trigger alerts, block PRs, or delay releases, we want to be conservative enough that detections are infrequently triggered by noise, but not so conservative as to miss most actual regressions. This test suite uses a 3 sigma standard so that only about 1 in every 700 runs is expected to fail the performance regression test suite due to expected variance in our measurements. + +In practice, the number of performance regression failures due to random noise will be higher because we are not incorporating the variance of the tools we use to measure, namely GHA. + +### Concrete Example: Performance Regression Detection + +The following example data was collected by running the code in this repository in GitHub Actions.
+ +In dbt v1.0.3, we have the following mean and standard deviation when parsing a dbt project with 2000 models: + +μ (mean): 41.22 seconds
+σ (stddev): 0.2525 seconds
+ +The 2-sided 3 sigma range can be calculated with these two values via: + +x < μ - 3 σ or x > μ + 3 σ
+x < 41.22 - 3 * 0.2525 or x > 41.22 + 3 * 0.2525
+x < 40.46 or x > 41.98
+ +It follows that the 1-sided 3 sigma range for performance regressions is just:
+x > 41.98 + +If, when we sample a single `dbt parse` of the same project with a commit slated to go into dbt v1.0.4, we observe a 42s parse time, that observation is so unlikely in the absence of a code-induced performance regression that we should investigate whether a regression exists in any of the commits between this failure and the commit where the initial distribution was measured. (A standalone sketch of this check in code appears at the end of this README.) + +Observations with 3 sigma significance that are _not_ performance regressions could be due to observing unlikely values (roughly 1 in every 750 observations), or to variation in the instruments we use to take these measurements, such as GitHub Actions. At this time we do not measure the variation of these instruments or account for it in our calculations, which means failures due to random noise are more likely than they would be if we did. + +### Concrete Example: Performance Modeling + +Once a new dbt version is released (excluding pre-releases), the performance characteristics of that released version need to be measured. In this repository this measurement is referred to as a baseline. + +After dbt v1.0.99 is released, a GitHub Action running from `main` (always the latest version of that action) takes the following steps: +- Checks out `main` for the latest performance runner. +- Pip installs dbt v1.0.99. +- Builds the runner if it's not already in the GitHub Actions cache. +- Invokes the performance runner's `model` subcommand via `./runner model`. +- The model subcommand calls hyperfine to run all of the project-command pairs a large number of times (maybe 20 or so) and saves the hyperfine outputs to files in `performance/baselines/1.0.99/`, one file per command-project pair. +- The action opens two PRs with these files: one against `main` and one against `1.0.latest` so that future PRs against these branches will detect regressions against the performance characteristics of dbt v1.0.99 instead of v1.0.98. +- The release driver for dbt v1.0.99 reviews and merges these PRs, which is the sole deliverable of the performance modeling work. ## Future work -- add more projects to test different configurations that have been known bottlenecks -- add more dbt commands to measure -- possibly using the uploaded json artifacts to store these results so they can be graphed over time -- reading new metrics from a file so no one has to edit rust source to add them to the suite -- instead of building the rust every time, we could publish and pull down the latest version. -- instead of manually setting the baseline version of dbt to test, pull down the latest stable version as the baseline. +- Pin commands to projects by reading commands from a file defined in the project. +- Add a postgres warehouse to run `dbt compile` and `dbt run` commands. +- Add more projects to test different configurations that have been known performance bottlenecks. +- Account for GitHub Actions variation: either measure it or eliminate it. To measure it, we could set up another action that periodically samples the same version of dbt and use a 7-day rolling variation. To eliminate it, we could run the action using something like [act](https://github.com/nektos/act) on dedicated hardware. +- Build in a git-bisect run to automatically identify the commits that caused a performance regression by modeling each commit's expected value for the failing metric. Running this automatically, or even providing a script to do this locally, would be useful.
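## Appendix: The Detection Rule in Code

The detection rule described in the concrete example above reduces to a single comparison: flag a sample when it exceeds the baseline mean by more than 3 standard deviations. The following is a minimal standalone Rust sketch of that rule using the numbers from the worked example; it mirrors the logic of `calculate_regression` in `performance/runner/src/calculate.rs`, which operates on the runner's `Baseline` and `Sample` types rather than bare floats.

```rust
// Minimal sketch of the one-sided 3-sigma check. The real runner builds a
// `Calculation` from a `Baseline` and a `Sample`; here we use bare numbers
// taken from the worked example above (dbt v1.0.3 parsing 2000 models).

/// True when the sampled runtime exceeds the baseline mean by more than
/// `sigma` standard deviations, i.e. a suspected regression.
fn is_regression(baseline_mean: f64, baseline_stddev: f64, sample: f64, sigma: f64) -> bool {
    let threshold = baseline_mean + sigma * baseline_stddev;
    sample > threshold
}

fn main() {
    let mean = 41.22; // μ of the baseline model, in seconds
    let stddev = 0.2525; // σ of the baseline model, in seconds
    let sample = 42.0; // a single `dbt parse` sample from the new commit

    // threshold = 41.22 + 3 * 0.2525 = 41.9775 seconds
    println!("threshold: {:.4}s", mean + 3.0 * stddev);

    // prints `regression suspected: true` because 42.0 > 41.9775
    println!(
        "regression suspected: {}",
        is_regression(mean, stddev, sample, 3.0)
    );
}
```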
diff --git a/performance/baselines/.gitignore b/performance/baselines/.gitignore new file mode 100644 index 00000000000..099edfcd3d1 --- /dev/null +++ b/performance/baselines/.gitignore @@ -0,0 +1 @@ +# placeholder for baselines directory diff --git a/performance/images/normal.svg b/performance/images/normal.svg new file mode 100644 index 00000000000..365713ba510 --- /dev/null +++ b/performance/images/normal.svg @@ -0,0 +1 @@ + diff --git a/performance/runner/Cargo.lock b/performance/runner/Cargo.lock index 4e7b6bba517..a6383b23e25 100644 --- a/performance/runner/Cargo.lock +++ b/performance/runner/Cargo.lock @@ -57,18 +57,59 @@ dependencies = [ "ansi_term", "atty", "bitflags", - "strsim", + "strsim 0.8.0", "textwrap", "unicode-width", "vec_map", ] +[[package]] +name = "darling" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "757c0ded2af11d8e739c4daea1ac623dd1624b06c844cf3f5a39f1bdbd99bb12" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c34d8efb62d0c2d7f60ece80f75e5c63c1588ba68032740494b0b9a996466e3" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade7bff147130fe5e6d39f089c6bd49ec0250f35d70b2eebf72afdfc919f15cc" +dependencies = [ + "darling_core", + "quote", + "syn", +] + [[package]] name = "either" version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "heck" version = "0.3.3" @@ -87,6 +128,12 @@ dependencies = [ "libc", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "itertools" version = "0.10.1" @@ -183,10 +230,17 @@ dependencies = [ "itertools", "serde", "serde_json", + "serde_with", "structopt", "thiserror", ] +[[package]] +name = "rustversion" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" + [[package]] name = "ryu" version = "1.0.5" @@ -224,12 +278,41 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_with" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad6056b4cb69b6e43e3a0f055def223380baecc99da683884f205bf347f7c4b3" +dependencies = [ + "rustversion", + "serde", + "serde_with_macros", +] + +[[package]] +name = "serde_with_macros" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12e47be9471c72889ebafb5e14d5ff930d89ae7a67bbdb5f8abb564f845a927e" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "strsim" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +[[package]] 
+name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + [[package]] name = "structopt" version = "0.3.22" diff --git a/performance/runner/Cargo.toml b/performance/runner/Cargo.toml index a093ecc64bc..6fb48ce825e 100644 --- a/performance/runner/Cargo.toml +++ b/performance/runner/Cargo.toml @@ -8,5 +8,6 @@ chrono = { version = "0.4.19", features = ["serde"] } itertools = "0.10.1" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +serde_with = { version = "1.11.0", features = [ "macros" ] } structopt = "0.3" thiserror = "1.0.26" diff --git a/performance/runner/src/calculate.rs b/performance/runner/src/calculate.rs index 9edcdc43a40..980f70d67bc 100644 --- a/performance/runner/src/calculate.rs +++ b/performance/runner/src/calculate.rs @@ -1,278 +1,171 @@ -use crate::exceptions::{CalculateError, IOError}; -use chrono::prelude::*; -use itertools::Itertools; -use serde::{Deserialize, Serialize}; -use std::fs; -use std::fs::DirEntry; -use std::path::{Path, PathBuf}; - -// This type exactly matches the type of array elements -// from hyperfine's output. Deriving `Serialize` and `Deserialize` -// gives us read and write capabilities via json_serde. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct Measurement { - pub command: String, - pub mean: f64, - pub stddev: f64, - pub median: f64, - pub user: f64, - pub system: f64, - pub min: f64, - pub max: f64, - pub times: Vec, -} - -// This type exactly matches the type of hyperfine's output. -// Deriving `Serialize` and `Deserialize` gives us read and -// write capabilities via json_serde. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct Measurements { - pub results: Vec, -} - -// Output data from a comparison between runs on the baseline -// and dev branches. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct Data { - pub threshold: f64, - pub difference: f64, - pub baseline: f64, - pub dev: f64, -} - -// The full output from a comparison between runs on the baseline -// and dev branches. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct Calculation { - pub metric: String, - pub regression: bool, - pub ts: DateTime, - pub data: Data, -} - -// A type to describe which measurement we are working with. This -// information is parsed from the filename of hyperfine's output. -#[derive(Debug, Clone, PartialEq)] -pub struct MeasurementGroup { - pub version: String, - pub run: String, - pub measurement: Measurement, -} - -// Given two measurements, return all the calculations. Calculations are -// flagged as regressions or not regressions. -fn calculate(metric: &str, dev: &Measurement, baseline: &Measurement) -> Vec { - // choosing the current timestamp for all calculations to be the same. - // this timestamp is not from the time of measurement becuase hyperfine - // controls that. Since calculation is run directly after, this is fine. 
- let ts = Utc::now(); - - let median_threshold = 1.05; // 5% regression threshold - let median_difference = dev.median / baseline.median; - - let stddev_threshold = 1.20; // 20% regression threshold - let stddev_difference = dev.stddev / baseline.stddev; - - vec![ - Calculation { - metric: ["median", metric].join("_"), - regression: median_difference > median_threshold, - ts: ts, - data: Data { - threshold: median_threshold, - difference: median_difference, - baseline: baseline.median, - dev: dev.median, - }, - }, - Calculation { - metric: ["stddev", metric].join("_"), - regression: stddev_difference > stddev_threshold, - ts: ts, - data: Data { - threshold: stddev_threshold, - difference: stddev_difference, - baseline: baseline.stddev, - dev: dev.stddev, - }, - }, - ] +use crate::exceptions::RunnerError; +use crate::fs; +use crate::types::*; +use std::collections::HashMap; +use std::path::PathBuf; + +// calculates a single regression for a matching sample-baseline pair. +// does not validate that the sample metric and baseline metric match. +fn calculate_regression(sample: &Sample, baseline: &Baseline, sigma: f64) -> Calculation { + let model = baseline.measurement.clone(); + let threshold = model.mean + sigma * model.stddev; + + Calculation { + version: baseline.version, + metric: baseline.metric.clone(), + regression: sample.value > threshold, + ts: sample.ts.clone(), + sigma: sigma, + mean: model.mean, + stddev: model.stddev, + threshold: threshold, + sample: sample.value, + } } -// Given a directory, read all files in the directory and return each -// filename with the deserialized json contents of that file. -fn measurements_from_files( - results_directory: &Path, -) -> Result, CalculateError> { - fs::read_dir(results_directory) - .or_else(|e| Err(IOError::ReadErr(results_directory.to_path_buf(), Some(e)))) - .or_else(|e| Err(CalculateError::CalculateIOError(e)))? +// Top-level function. Given a path for the result directory, call the above +// functions to compare and collect calculations. Calculations include all samples +// regardless of whether they passed or failed. +pub fn regressions( + baseline_dir: &PathBuf, + projects_dir: &PathBuf, + tmp_dir: &PathBuf, +) -> Result, RunnerError> { + // finds the latest version availble in the baseline directory + let latest_version = fs::latest_version_from(baseline_dir)?; + + // the baseline we want to target is the latest one available in this branch + let mut target_baseline_dir = baseline_dir.clone(); + target_baseline_dir.push(latest_version.to_string()); + + // these are all the metrics for all available baselines from the target version + let baselines: Vec = fs::from_json_files::(&target_baseline_dir)? .into_iter() - .map(|entry| { - let ent: DirEntry = entry - .or_else(|e| Err(IOError::ReadErr(results_directory.to_path_buf(), Some(e)))) - .or_else(|e| Err(CalculateError::CalculateIOError(e)))?; - - Ok(ent.path()) - }) - .collect::, CalculateError>>()? 
- .iter() - .filter(|path| { - path.extension() - .and_then(|ext| ext.to_str()) - .map_or(false, |ext| ext.ends_with("json")) - }) - .map(|path| { - fs::read_to_string(path) - .or_else(|e| Err(IOError::BadFileContentsErr(path.clone(), Some(e)))) - .or_else(|e| Err(CalculateError::CalculateIOError(e))) - .and_then(|contents| { - serde_json::from_str::(&contents) - .or_else(|e| Err(CalculateError::BadJSONErr(path.clone(), Some(e)))) - }) - .map(|m| (path.clone(), m)) - }) - .collect() -} - -// Given a list of filename-measurement pairs, detect any regressions by grouping -// measurements together by filename. -fn calculate_regressions( - measurements: &[(&PathBuf, &Measurement)], -) -> Result, CalculateError> { - /* - Strategy of this function body: - 1. [Measurement] -> [MeasurementGroup] - 2. Sort the MeasurementGroups - 3. Group the MeasurementGroups by "run" - 4. Call `calculate` with the two resulting Measurements as input - */ - - let mut measurement_groups: Vec = measurements - .iter() - .map(|(p, m)| { - p.file_name() - .ok_or_else(|| IOError::MissingFilenameErr(p.to_path_buf())) - .and_then(|name| { - name.to_str() - .ok_or_else(|| IOError::FilenameNotUnicodeErr(p.to_path_buf())) - }) - .map(|name| { - let parts: Vec<&str> = name.split("_").collect(); - MeasurementGroup { - version: parts[0].to_owned(), - run: parts[1..].join("_"), - measurement: (*m).clone(), - } - }) - }) - .collect::, IOError>>() - .or_else(|e| Err(CalculateError::CalculateIOError(e)))?; + .map(|(_, x)| x) + .collect(); + + // check that we have at least one baseline. + // If this error is returned, in all liklihood the runner cannot read the existing baselines + // for some reason. Every branch is expected to have baselines from when they were branched off of main. + if baselines.is_empty() { + return Err(RunnerError::NoVersionedBaselineData( + target_baseline_dir.clone(), + )); + } - measurement_groups.sort_by(|x, y| (&x.run, &x.version).cmp(&(&y.run, &y.version))); + let samples: Vec = fs::take_samples(projects_dir, tmp_dir)?; - // locking up mutation - let sorted_measurement_groups = measurement_groups; + // turn samples into a map so they can be easily matched to baseline data + let m_samples: HashMap = + samples.into_iter().map(|x| (x.metric.clone(), x)).collect(); - let calculations: Vec = sorted_measurement_groups - .iter() - .group_by(|x| &x.run) + // match all baseline metrics to samples taken and calculate regressions with a 3 sigma threshold + let calculations: Vec = baselines .into_iter() - .map(|(_, g)| { - let mut groups: Vec<&MeasurementGroup> = g.collect(); - groups.sort_by(|x, y| x.version.cmp(&y.version)); - - match groups.len() { - 2 => { - let dev = &groups[1]; - let baseline = &groups[0]; - - if dev.version == "dev" && baseline.version == "baseline" { - Ok(calculate(&dev.run, &dev.measurement, &baseline.measurement)) - } else { - Err(CalculateError::BadBranchNameErr( - baseline.version.clone(), - dev.version.clone(), - )) - } - } - i => { - let gs: Vec = groups.into_iter().map(|x| x.clone()).collect(); - Err(CalculateError::BadGroupSizeErr(i, gs)) - } - } + .map(|baseline| { + m_samples + .get(&baseline.metric) + .ok_or_else(|| RunnerError::BaselineMetricNotSampled(baseline.metric.clone())) + .map(|sample| calculate_regression(&sample, &baseline, 3.0)) }) - .collect::>, CalculateError>>()? - .concat(); + .collect::, RunnerError>>()?; Ok(calculations) } -// Top-level function. Given a path for the result directory, call the above -// functions to compare and collect calculations. 
Calculations include both -// metrics that fall within the threshold and regressions. -pub fn regressions(results_directory: &PathBuf) -> Result, CalculateError> { - measurements_from_files(Path::new(&results_directory)).and_then(|v| { - // exit early with an Err if there are no results to process - if v.len() <= 0 { - Err(CalculateError::NoResultsErr(results_directory.clone())) - // we expect two runs for each project-metric pairing: one for each branch, baseline - // and dev. An odd result count is unexpected. - } else if v.len() % 2 == 1 { - Err(CalculateError::OddResultsCountErr( - v.len(), - results_directory.clone(), - )) - } else { - // otherwise, we can do our comparisons - let measurements = v - .iter() - // the way we're running these, the files will each contain exactly one measurement, hence `results[0]` - .map(|(p, ms)| (p, &ms.results[0])) - .collect::>(); - - calculate_regressions(&measurements[..]) - } - }) -} - #[cfg(test)] mod tests { use super::*; + use chrono::prelude::*; #[test] - fn detects_5_percent_regression() { - let dev = Measurement { - command: "some command".to_owned(), - mean: 1.06, - stddev: 1.06, - median: 1.06, - user: 1.06, - system: 1.06, - min: 1.06, - max: 1.06, - times: vec![], + fn detects_3sigma_regression() { + let metric = Metric { + name: "test".to_owned(), + project_name: "detects 3 sigma".to_owned(), }; - let baseline = Measurement { - command: "some command".to_owned(), - mean: 1.00, - stddev: 1.00, - median: 1.00, - user: 1.00, - system: 1.00, - min: 1.00, - max: 1.00, - times: vec![], + let baseline = Baseline { + version: Version::new(9, 9, 9), + metric: metric.clone(), + ts: Utc::now(), + measurement: Measurement { + command: "some command".to_owned(), + mean: 1.00, + stddev: 0.1, + median: 1.00, + user: 1.00, + system: 1.00, + min: 0.00, + max: 2.00, + times: vec![], + }, + }; + + let sample = Sample { + metric: metric, + value: 1.31, + ts: Utc::now(), }; - let calculations = calculate("test_metric", &dev, &baseline); - let regressions: Vec<&Calculation> = - calculations.iter().filter(|calc| calc.regression).collect(); + let calculation = calculate_regression( + &sample, &baseline, 3.0, // 3 sigma + ); - // expect one regression for median - println!("{:#?}", regressions); - assert_eq!(regressions.len(), 1); - assert_eq!(regressions[0].metric, "median_test_metric"); + // expect a regression for the mean being outside the 3 sigma + println!("{:#?}", calculation); + assert!(calculation.regression); + } + + #[test] + fn passes_near_3sigma() { + let metric = Metric { + name: "test".to_owned(), + project_name: "passes near 3 sigma".to_owned(), + }; + + let baseline = Baseline { + version: Version::new(9, 9, 9), + metric: metric.clone(), + ts: Utc::now(), + measurement: Measurement { + command: "some command".to_owned(), + mean: 1.00, + stddev: 0.1, + median: 1.00, + user: 1.00, + system: 1.00, + min: 0.00, + max: 2.00, + times: vec![], + }, + }; + + let sample = Sample { + metric: metric, + value: 1.29, + ts: Utc::now(), + }; + + let calculation = calculate_regression( + &sample, &baseline, 3.0, // 3 sigma + ); + + // expect no regressions + println!("{:#?}", calculation); + assert!(!calculation.regression); + } + + // The serializer and deserializer are custom implementations + // so they should be tested that they match. 
+ #[test] + fn version_serialize_loop() { + let v = Version { + major: 1, + minor: 2, + patch: 3, + }; + let v2 = serde_json::from_str::(&serde_json::to_string_pretty(&v).unwrap()); + assert_eq!(v, v2.unwrap()); } } diff --git a/performance/runner/src/exceptions.rs b/performance/runner/src/exceptions.rs index f278da6c365..bd04704b4d8 100644 --- a/performance/runner/src/exceptions.rs +++ b/performance/runner/src/exceptions.rs @@ -1,7 +1,5 @@ -use crate::calculate::*; +use crate::types::Metric; use std::io; -#[cfg(test)] -use std::path::Path; use std::path::PathBuf; use thiserror::Error; @@ -12,6 +10,8 @@ use thiserror::Error; pub enum IOError { #[error("ReadErr: The file cannot be read.\nFilepath: {}\nOriginating Exception: {}", .0.to_string_lossy().into_owned(), .1.as_ref().map_or("None".to_owned(), |e| format!("{}", e)))] ReadErr(PathBuf, Option), + #[error("WriteErr: The file cannot be written to.\nFilepath: {}\nOriginating Exception: {}", .0.to_string_lossy().into_owned(), .1.as_ref().map_or("None".to_owned(), |e| format!("{}", e)))] + WriteErr(PathBuf, Option), #[error("MissingFilenameErr: The path provided does not specify a file.\nFilepath: {}", .0.to_string_lossy().into_owned())] MissingFilenameErr(PathBuf), #[error("FilenameNotUnicodeErr: The filename is not expressible in unicode. Consider renaming the file.\nFilepath: {}", .0.to_string_lossy().into_owned())] @@ -20,136 +20,40 @@ pub enum IOError { BadFileContentsErr(PathBuf, Option), #[error("CommandErr: System command failed to run.\nOriginating Exception: {}", .0.as_ref().map_or("None".to_owned(), |e| format!("{}", e)))] CommandErr(Option), + #[error("CannotRecreateTempDirErr: attempted to delete and recreate temp dir at path {}\nOriginating Exception: {}", .0.to_string_lossy().into_owned(), .1)] + CannotRecreateTempDirErr(PathBuf, io::Error), + #[error("BadFilestemError: failed to read the filestem from path {}", .0.to_string_lossy().into_owned())] + BadFilestemError(PathBuf), + #[error("ReadIterErr: While traversing a directory, an error occured.\nDirectory: {}\nOriginating Exception: {}", .0.to_string_lossy().into_owned(), .1.as_ref().map_or("None".to_owned(), |e| format!("{}", e)))] + ReadIterErr(PathBuf, Option), } // Custom Error messages for the error states we could encounter -// during calculation, and are not prevented at compile time. New -// constructors should be added for any new error situations that +// in any of the subcommands, and are not prevented at compile time. +// New constructors should be added for any new error situations that // come up. The desired output of these errors is tested below. #[derive(Debug, Error)] -pub enum CalculateError { - #[error("BadJSONErr: JSON in file cannot be deserialized as expected.\nFilepath: {}\nOriginating Exception: {}", .0.to_string_lossy().into_owned(), .1.as_ref().map_or("None".to_owned(), |e| format!("{}", e)))] - BadJSONErr(PathBuf, Option), +pub enum RunnerError { + #[error("VersionParseFail: Error parsing input `{}`. Must be in the format \"major.minor.patch\" where each component is an integer.", .0)] + VersionParseFail(String), + #[error("MetricParseFail: Error parsing input `{}`. 
Must be in the format \"metricname___projectname\" with no file extensions.", .0)] + MetricParseFail(String), + #[error("BadJSONErr: JSON in file cannot be deserialized as expected.\nRaw json: {}\nOriginating Exception: {}", .0, .1.as_ref().map_or("None".to_owned(), |e| format!("{}", e)))] + BadJSONErr(String, Option), + #[error("SerializationErr: Object cannot be serialized as expected.\nOriginating Exception: {}", .0)] + SerializationErr(serde_json::Error), #[error("{}", .0)] - CalculateIOError(IOError), - #[error("NoResultsErr: The results directory has no json files in it.\nFilepath: {}", .0.to_string_lossy().into_owned())] - NoResultsErr(PathBuf), - #[error("OddResultsCountErr: The results directory has an odd number of results in it. Expected an even number.\nFile Count: {}\nFilepath: {}", .0, .1.to_string_lossy().into_owned())] - OddResultsCountErr(usize, PathBuf), - #[error("BadGroupSizeErr: Expected two results per group, one for each branch-project pair.\nCount: {}\nGroup: {:?}", .0, .1.into_iter().map(|group| (&group.version[..], &group.run[..])).collect::>())] - BadGroupSizeErr(usize, Vec), - #[error("BadBranchNameErr: Branch names must be 'baseline' and 'dev'.\nFound: {}, {}", .0, .1)] - BadBranchNameErr(String, String), + RunnerIOError(IOError), + #[error("HyperfineNonZeroExitCode: Hyperfine child process exited with non-zero exit code: {}", .0)] + HyperfineNonZeroExitCode(i32), + #[error("NoVersionedBaselineData: There was no versioned data in the following directory: {}\n expected structure like //metric.json", .0.to_string_lossy().into_owned())] + NoVersionedBaselineData(PathBuf), + #[error("BaselineMetricNotSampled: The metric {} on project {} was included in the baseline comparison but was not sampled.", .0.name, .0.project_name)] + BaselineMetricNotSampled(Metric), } -// Tests for exceptions -#[cfg(test)] -mod tests { - use super::*; - - // Tests the output fo io error messages. There should be at least one per enum constructor. - #[test] - fn test_io_error_messages() { - let pairs = vec![ - ( - IOError::ReadErr(Path::new("dummy/path/file.json").to_path_buf(), None), - r#"ReadErr: The file cannot be read. -Filepath: dummy/path/file.json -Originating Exception: None"#, - ), - ( - IOError::MissingFilenameErr(Path::new("dummy/path/no_file/").to_path_buf()), - r#"MissingFilenameErr: The path provided does not specify a file. -Filepath: dummy/path/no_file/"#, - ), - ( - IOError::FilenameNotUnicodeErr(Path::new("dummy/path/no_file/").to_path_buf()), - r#"FilenameNotUnicodeErr: The filename is not expressible in unicode. Consider renaming the file. -Filepath: dummy/path/no_file/"#, - ), - ( - IOError::BadFileContentsErr( - Path::new("dummy/path/filenotexist.json").to_path_buf(), - None, - ), - r#"BadFileContentsErr: Check that the file exists and is readable. -Filepath: dummy/path/filenotexist.json -Originating Exception: None"#, - ), - ( - IOError::CommandErr(None), - r#"CommandErr: System command failed to run. -Originating Exception: None"#, - ), - ]; - - for (err, msg) in pairs { - assert_eq!(format!("{}", err), msg) - } - } - - // Tests the output fo calculate error messages. There should be at least one per enum constructor. - #[test] - fn test_calculate_error_messages() { - let pairs = vec![ - ( - CalculateError::BadJSONErr(Path::new("dummy/path/file.json").to_path_buf(), None), - r#"BadJSONErr: JSON in file cannot be deserialized as expected. 
-Filepath: dummy/path/file.json -Originating Exception: None"#, - ), - ( - CalculateError::BadJSONErr(Path::new("dummy/path/file.json").to_path_buf(), None), - r#"BadJSONErr: JSON in file cannot be deserialized as expected. -Filepath: dummy/path/file.json -Originating Exception: None"#, - ), - ( - CalculateError::NoResultsErr(Path::new("dummy/path/no_file/").to_path_buf()), - r#"NoResultsErr: The results directory has no json files in it. -Filepath: dummy/path/no_file/"#, - ), - ( - CalculateError::OddResultsCountErr( - 3, - Path::new("dummy/path/no_file/").to_path_buf(), - ), - r#"OddResultsCountErr: The results directory has an odd number of results in it. Expected an even number. -File Count: 3 -Filepath: dummy/path/no_file/"#, - ), - ( - CalculateError::BadGroupSizeErr( - 1, - vec![MeasurementGroup { - version: "dev".to_owned(), - run: "some command".to_owned(), - measurement: Measurement { - command: "some command".to_owned(), - mean: 1.0, - stddev: 1.0, - median: 1.0, - user: 1.0, - system: 1.0, - min: 1.0, - max: 1.0, - times: vec![1.0, 1.1, 0.9, 1.0, 1.1, 0.9, 1.1], - }, - }], - ), - r#"BadGroupSizeErr: Expected two results per group, one for each branch-project pair. -Count: 1 -Group: [("dev", "some command")]"#, - ), - ( - CalculateError::BadBranchNameErr("boop".to_owned(), "noop".to_owned()), - r#"BadBranchNameErr: Branch names must be 'baseline' and 'dev'. -Found: boop, noop"#, - ), - ]; - - for (err, msg) in pairs { - assert_eq!(format!("{}", err), msg) - } +impl From for RunnerError { + fn from(item: IOError) -> Self { + RunnerError::RunnerIOError(item) } } diff --git a/performance/runner/src/fs.rs b/performance/runner/src/fs.rs new file mode 100644 index 00000000000..7f4ee138529 --- /dev/null +++ b/performance/runner/src/fs.rs @@ -0,0 +1,349 @@ +use crate::exceptions::{IOError, RunnerError}; +use crate::types::*; +use chrono::prelude::*; +use serde::de::DeserializeOwned; +use std::fs::DirEntry; +use std::io; +use std::path::{Path, PathBuf}; +use std::process::{Command, ExitStatus}; +use std::str::FromStr; +use std::{cmp, fs}; + +// TODO these should not be defined here anymore. they need to be split at the github action level. +// To add a new metric to the test suite, simply define it in this list +static METRICS: [HyperfineCmd; 1] = [HyperfineCmd { + name: "parse", + prepare: "dbt clean", + cmd: "dbt parse --no-version-check", +}]; + +pub fn from_json_files( + dir: &dyn AsRef, +) -> Result, RunnerError> { + let contents = file_contents_from(dir, "json")?; + let (paths, strings): (Vec, Vec) = contents.iter().cloned().unzip(); + let ts = map_deserialize(&strings)?; + Ok(paths.into_iter().zip(ts).collect()) +} + +// Given a directory, read all files in the directory and return each +// filename with the deserialized json contents of that file. +pub fn map_deserialize(serialized: &[String]) -> Result, RunnerError> { + serialized + .into_iter() + .map(|contents| { + serde_json::from_str::(contents) + .or_else(|e| Err(RunnerError::BadJSONErr(contents.clone(), Some(e)))) + }) + .collect() +} + +// gets file contents from a directory for all files with a specific extension. +// great for deserializing with serde. +// `extension` should be everything after the dot. e.g. "json" +pub fn file_contents_from( + results_directory: &dyn AsRef, + extension: &str, +) -> Result, IOError> { + let entries: Vec = fs::read_dir(results_directory) + .or_else(|e| { + Err(IOError::ReadErr( + results_directory.as_ref().to_path_buf(), + Some(e), + )) + })? 
+ .collect::, io::Error>>() + .or_else(|e| { + Err(IOError::ReadIterErr( + results_directory.as_ref().to_path_buf(), + Some(e), + )) + })?; + + entries + .into_iter() + .filter_map(|entry| { + let path = entry.path(); + let os_ext = path.extension()?; + let ext = os_ext.to_str()?; + if ext.ends_with(extension) { + Some(path) + } else { + None + } + }) + .map(|path| { + fs::read_to_string(&path) + .map(|contents| (path.clone(), contents)) + .or_else(|e| Err(IOError::BadFileContentsErr(path.clone(), Some(e)))) + }) + .collect() +} + +// TODO this should read the commands to run on each project from the project definitions themselves +// not from a hard coded array in this file. +fn get_projects<'a>( + projects_directory: &dyn AsRef, +) -> Result)>, IOError> { + let entries = fs::read_dir(projects_directory).or_else(|e| { + Err(IOError::ReadErr( + projects_directory.as_ref().to_path_buf(), + Some(e), + )) + })?; + + let results: Vec<(PathBuf, String, HyperfineCmd<'a>)> = entries + .map(|entry| { + let path = entry + .or_else(|e| { + Err(IOError::ReadErr( + projects_directory.as_ref().to_path_buf(), + Some(e), + )) + })? + .path(); + + let project_name: String = path + .file_name() + .ok_or_else(|| IOError::MissingFilenameErr(path.clone().to_path_buf())) + .and_then(|x| { + x.to_str() + .ok_or_else(|| IOError::FilenameNotUnicodeErr(path.clone().to_path_buf())) + })? + .to_owned(); + + // each project-metric pair we will run + // TODO maybe not every command should run on every project. maybe have these defined as files in the project directories? + let pairs = METRICS + .iter() + .map(|metric| (path.clone(), project_name.clone(), metric.clone())) + .collect::)>>(); + + Ok(pairs) + }) + .collect::)>>, IOError>>()? + .concat(); + + Ok(results) +} + +// reads directory names under this directory and converts their names to sem-ver versions. +// returns the latest version of the set. +// +// this is used to identify which baseline version we should be targeting to compare samples against. +pub fn latest_version_from(dir: &dyn AsRef) -> Result { + let versions = all_dirs_in(dir)? + .into_iter() + // this line is a little opaque but it's just converting OsStr -> String with options along the way. + .filter_map(|d| { + d.file_name() + .and_then(|fname| fname.to_str().map(|x| x.to_string())) + }) + .map(|fname| Version::from_str(&fname)) + .collect::, RunnerError>>()?; + + versions + .into_iter() + .reduce(cmp::max) + .ok_or_else(|| RunnerError::NoVersionedBaselineData(dir.as_ref().to_path_buf())) +} + +fn all_dirs_in(dir: &dyn AsRef) -> Result, IOError> { + Ok(fs::read_dir(dir) + .or_else(|e| Err(IOError::ReadErr(dir.as_ref().to_path_buf(), Some(e))))? + .collect::, io::Error>>() + .or_else(|e| Err(IOError::ReadIterErr(dir.as_ref().to_path_buf(), Some(e))))? + .into_iter() + .filter_map(|d| { + let path = d.path(); + if path.is_dir() { + Some(path) + } else { + None + } + }) + .collect()) +} + +// TODO can we call hyperfine as a rust library? +// https://crates.io/crates/hyperfine/1.13.0 +fn run_hyperfine( + run_dir: &dyn AsRef, + command: &str, + prep: &str, + runs: i32, + output_file: &dyn AsRef, +) -> Result { + Command::new("hyperfine") + .current_dir(run_dir) + // warms filesystem caches by running the command first without counting it. 
+ // alternatively we could clear them before each run + .arg("--warmup") + .arg("1") + // --min-runs defaults to 10 + .arg("--min-runs") + .arg(runs.to_string()) + .arg("--max-runs") + .arg(runs.to_string()) + .arg("--prepare") + .arg(prep) + .arg(command) + .arg("--export-json") + .arg(output_file.as_ref()) + // this prevents hyperfine from capturing dbt's output. + // Noisy, but good for debugging when tests fail. + .arg("--show-output") + .status() // use spawn() here instead for more information + .or_else(|e| Err(IOError::CommandErr(Some(e)))) +} + +// Attempt to delete the directory and its contents. If it doesn't exist we'll just recreate it anyway. +fn clear_dir(dir: &dyn AsRef) -> Result<(), io::Error> { + match fs::remove_dir_all(dir) { + // whether it existed or not, create the directory and any missing parent dirs. + _ => fs::create_dir_all(dir), + } +} + +// deletes the output directory, makes one hyperfine run for each project-metric pair, +// reads in the results, and returns a Sample for each project-metric pair. +pub fn take_samples( + projects_dir: &dyn AsRef, + out_dir: &dyn AsRef, +) -> Result, RunnerError> { + clear_dir(out_dir).or_else(|e| { + Err(IOError::CannotRecreateTempDirErr( + out_dir.as_ref().to_path_buf(), + e, + )) + })?; + + // using one time stamp for all samples. + let ts = Utc::now(); + + // run hyperfine in serial for each project-metric pair + for (path, project_name, hcmd) in get_projects(projects_dir)? { + let metric = Metric { + name: hcmd.name.to_owned(), + project_name: project_name.to_owned(), + }; + + let command = format!("{} --profiles-dir ../../project_config/", hcmd.cmd); + let mut output_file = out_dir.as_ref().to_path_buf(); + output_file.push(metric.filename()); + + // TODO we really want one run, not two. Right now the second is discarded. so we might not want to use hyperfine for taking samples. + let status = run_hyperfine(&path, &command, hcmd.clone().prepare, 2, &output_file) + .or_else(|e| Err(RunnerError::from(e)))?; + + match status.code() { + Some(code) if code != 0 => return Err(RunnerError::HyperfineNonZeroExitCode(code)), + _ => (), + } + } + + let samples = from_json_files::(out_dir)? + .into_iter() + .map(|(path, measurement)| { + Sample::from_measurement( + &path, + &measurement.results[0], // TODO if its empty it'll panic. + ts, + ) + }) + .collect::, RunnerError>>()?; + + Ok(samples) +} + +// Calls hyperfine via system command, reads in the results, and writes out Baseline json files. +// Intended to be called after each new version is released. +pub fn model<'a>( + version: Version, + projects_directory: &dyn AsRef, + out_dir: &dyn AsRef, + tmp_dir: &dyn AsRef, + n_runs: i32, +) -> Result, RunnerError> { + for (path, project_name, hcmd) in get_projects(projects_directory)? { + let metric = Metric { + name: hcmd.name.to_owned(), + project_name: project_name.to_owned(), + }; + + let command = format!("{} --profiles-dir ../../project_config/", hcmd.clone().cmd); + let mut tmp_file = tmp_dir.as_ref().to_path_buf(); + tmp_file.push(metric.filename()); + + let status = run_hyperfine(&path, &command, hcmd.clone().prepare, n_runs, &tmp_file) + .or_else(|e| Err(RunnerError::from(e)))?; + + match status.code() { + Some(code) if code != 0 => return Err(RunnerError::HyperfineNonZeroExitCode(code)), + _ => (), + } + } + + // read what hyperfine wrote + let measurements: Vec<(PathBuf, Measurements)> = from_json_files::(tmp_dir)?; + + // put it in the right format using the same timestamp for every model. 
+ let now = Utc::now(); + let baselines: Vec = measurements + .into_iter() + .map(|m| { + let (path, measurement) = m; + from_measurement(version, path, measurement, now) + }) + .collect::, RunnerError>>()?; + + // write a file for each baseline measurement + for model in &baselines { + // create the correct filename like `/out_dir/1.0.0/parse___2000_models.json` + let mut out_file = out_dir.as_ref().to_path_buf(); + out_file.push(version.to_string()); + + // write out the version directory. ignore errors since if it's already made that's fine. + match fs::create_dir(out_file.clone()) { + _ => (), + }; + + // continue creating the correct filename + out_file.push(model.metric.filename()); + out_file.set_extension("json"); + + // get the serialized string + let s = serde_json::to_string(&model).or_else(|e| Err(RunnerError::SerializationErr(e)))?; + + // TODO writing files in _this function_ isn't the most graceful way organize the code. + // write the newly modeled baseline to the above path + fs::write(out_file.clone(), s) + .or_else(|e| Err(IOError::WriteErr(out_file.clone(), Some(e))))?; + } + + Ok(baselines) +} + +// baseline filenames are expected to encode the metric information +fn from_measurement( + version: Version, + path: PathBuf, + measurements: Measurements, + // forcing time to be provided so that uniformity of time stamps across a set of baselines is more explicit + ts: DateTime, +) -> Result { + // `file_name` is boop___proj.json. `file_stem` is boop___proj. + let filestem = path.file_stem().map_or_else( + || Err(IOError::BadFilestemError(path.clone())), + |stem| Ok(stem.to_string_lossy().to_string()), + )?; + + let metric = Metric::from_str(&filestem)?; + + Ok(Baseline { + version: version, + metric: metric, + ts: ts, + measurement: measurements.results[0].clone(), + }) +} diff --git a/performance/runner/src/main.rs b/performance/runner/src/main.rs index 6ab83e33338..ccc508960bc 100644 --- a/performance/runner/src/main.rs +++ b/performance/runner/src/main.rs @@ -2,14 +2,11 @@ extern crate structopt; mod calculate; mod exceptions; -mod measure; +mod fs; +mod types; -use crate::calculate::Calculation; -use crate::exceptions::CalculateError; -use chrono::offset::Utc; -use std::fs::metadata; -use std::fs::File; -use std::io::Write; +use crate::exceptions::RunnerError; +use crate::types::{Calculation, Version}; use std::path::PathBuf; use structopt::StructOpt; @@ -18,19 +15,30 @@ use structopt::StructOpt; #[derive(Clone, Debug, StructOpt)] #[structopt(name = "performance", about = "performance regression testing runner")] enum Opt { - #[structopt(name = "measure")] - Measure { + #[structopt(name = "model")] + Model { + #[structopt(short)] + version: Version, #[structopt(parse(from_os_str))] #[structopt(short)] projects_dir: PathBuf, + #[structopt(parse(from_os_str))] + #[structopt(short)] + baselines_dir: PathBuf, + #[structopt(parse(from_os_str))] + #[structopt(short)] + tmp_dir: PathBuf, #[structopt(short)] - branch_name: String, + n_runs: i32, }, - #[structopt(name = "calculate")] - Calculate { + #[structopt(name = "sample")] + Sample { #[structopt(parse(from_os_str))] #[structopt(short)] - results_dir: PathBuf, + projects_dir: PathBuf, + #[structopt(parse(from_os_str))] + #[structopt(short)] + baseline_dir: PathBuf, #[structopt(parse(from_os_str))] #[structopt(short)] out_dir: PathBuf, @@ -42,45 +50,44 @@ enum Opt { // // This is where all the printing should happen. Exiting happens // in main, and module functions should only return values. 
-fn run_app() -> Result<i32, CalculateError> {
+fn run_app() -> Result<i32, RunnerError> {
     // match what the user inputs from the cli
     match Opt::from_args() {
-        // measure subcommand
-        Opt::Measure {
+        // model subcommand
+        Opt::Model {
+            version,
             projects_dir,
-            branch_name,
+            baselines_dir,
+            tmp_dir,
+            n_runs,
         } => {
+            // note: I tried resolving relative paths here, and I couldn't get it to work.
+            // this means the cli requires absolute paths for now.
+
             // if there are any nonzero exit codes from the hyperfine runs,
             // return the first one. otherwise return zero.
-            measure::measure(&projects_dir, &branch_name)
-                .or_else(|e| Err(CalculateError::CalculateIOError(e)))?
-                .iter()
-                .map(|status| status.code())
-                .flatten()
-                .filter(|code| *code != 0)
-                .collect::<Vec<i32>>()
-                .get(0)
-                .map_or(Ok(0), |x| {
-                    println!("Main: a child process exited with a nonzero status code.");
-                    Ok(*x)
-                })
+            let baseline = fs::model(version, &projects_dir, &baselines_dir, &tmp_dir, n_runs)?;
+
+            // print the results to the console for viewing in CI
+            println!(":: Modeling Results ::");
+            let s = serde_json::to_string_pretty(&baseline)
+                .or_else(|e| Err(RunnerError::SerializationErr(e)))?;
+            println!("{}", s);
+
+            Ok(0)
         }
-        // calculate subcommand
-        Opt::Calculate {
-            results_dir,
+        // samples performance characteristics from the current commit
+        // and compares them against the model for the latest version in this branch.
+        // prints all sample results and exits with a non-zero exit code
+        // when a regression is suspected.
+        Opt::Sample {
+            projects_dir,
+            baseline_dir,
             out_dir,
         } => {
-            // validate output directory and exit early if it won't work.
-            let md = metadata(&out_dir)
-                .expect("Main: Failed to read specified output directory metadata. Does it exist?");
-            if !md.is_dir() {
-                eprintln!("Main: Output directory is not a directory");
-                return Ok(1);
-            }
-
             // get all the calculations or gracefully show the user an exception
-            let calculations = calculate::regressions(&results_dir)?;
+            let calculations = calculate::regressions(&baseline_dir, &projects_dir, &out_dir)?;
 
             // print all calculations to stdout so they can be easily debugged
             // via CI.
@@ -89,26 +96,6 @@ fn run_app() -> Result<i32, CalculateError> {
                 println!("{:#?}\n", c);
             }
 
-            // indented json string representation of the calculations array
-            let json_calcs = serde_json::to_string_pretty(&calculations)
-                .expect("Main: Failed to serialize calculations to json");
-
-            // if there are any calculations, use the first timestamp, if there are none
-            // just use the current time.
-            let ts = calculations
-                .first()
-                .map_or_else(|| Utc::now(), |calc| calc.ts);
-
-            // create the empty destination file, and write the json string
-            let outfile = &mut out_dir.into_os_string();
-            outfile.push("/final_calculations_");
-            outfile.push(ts.timestamp().to_string());
-            outfile.push(".json");
-
-            let mut f = File::create(outfile).expect("Main: Unable to create file");
-            f.write_all(json_calcs.as_bytes())
-                .expect("Main: Unable to write data");
-
             // filter for regressions
             let regressions: Vec<&Calculation> =
                 calculations.iter().filter(|c| c.regression).collect();
diff --git a/performance/runner/src/measure.rs b/performance/runner/src/measure.rs
deleted file mode 100644
index c2e26ca277d..00000000000
--- a/performance/runner/src/measure.rs
+++ /dev/null
@@ -1,92 +0,0 @@
-use crate::exceptions::IOError;
-use std::fs;
-use std::path::PathBuf;
-use std::process::{Command, ExitStatus};
-
-// `Metric` defines a dbt command that we want to measure on both the
-// baseline and dev branches.
-#[derive(Debug, Clone)]
-struct Metric<'a> {
-    name: &'a str,
-    prepare: &'a str,
-    cmd: &'a str,
-}
-
-impl Metric<'_> {
-    // Returns the proper filename for the hyperfine output for this metric.
-    fn outfile(&self, project: &str, branch: &str) -> String {
-        [branch, "_", self.name, "_", project, ".json"].join("")
-    }
-}
-
-// Calls hyperfine via system command, and returns all the exit codes for each hyperfine run.
-pub fn measure<'a>(
-    projects_directory: &PathBuf,
-    dbt_branch: &str,
-) -> Result<Vec<ExitStatus>, IOError> {
-    /*
-        Strategy of this function body:
-            1. Read all directory names in `projects_directory`
-            2. Pair `n` projects with `m` metrics for a total of n*m pairs
-            3. Run hyperfine on each project-metric pair
-    */
-
-    // To add a new metric to the test suite, simply define it in this list:
-    // TODO: This could be read from a config file in a future version.
-    let metrics: Vec<Metric> = vec![Metric {
-        name: "parse",
-        prepare: "rm -rf target/",
-        cmd: "dbt parse --no-version-check",
-    }];
-
-    fs::read_dir(projects_directory)
-        .or_else(|e| Err(IOError::ReadErr(projects_directory.to_path_buf(), Some(e))))?
-        .map(|entry| {
-            let path = entry
-                .or_else(|e| Err(IOError::ReadErr(projects_directory.to_path_buf(), Some(e))))?
-                .path();
-
-            let project_name: String = path
-                .file_name()
-                .ok_or_else(|| IOError::MissingFilenameErr(path.clone().to_path_buf()))
-                .and_then(|x| {
-                    x.to_str()
-                        .ok_or_else(|| IOError::FilenameNotUnicodeErr(path.clone().to_path_buf()))
-                })?
-                .to_owned();
-
-            // each project-metric pair we will run
-            let pairs = metrics
-                .iter()
-                .map(|metric| (path.clone(), project_name.clone(), metric))
-                .collect::<Vec<(PathBuf, String, &Metric<'a>)>>();
-
-            Ok(pairs)
-        })
-        .collect::<Result<Vec<Vec<(PathBuf, String, &Metric<'a>)>>, IOError>>()?
-        .concat()
-        .iter()
-        // run hyperfine on each pairing
-        .map(|(path, project_name, metric)| {
-            Command::new("hyperfine")
-                .current_dir(path)
-                // warms filesystem caches by running the command first without counting it.
-                // alternatively we could clear them before each run
-                .arg("--warmup")
-                .arg("1")
-                // --min-runs defaults to 10
-                .arg("--min-runs")
-                .arg("20")
-                .arg("--prepare")
-                .arg(metric.prepare)
-                .arg([metric.cmd, " --profiles-dir ", "../../project_config/"].join(""))
-                .arg("--export-json")
-                .arg(["../../results/", &metric.outfile(project_name, dbt_branch)].join(""))
-                // this prevents hyperfine from capturing dbt's output.
-                // Noisy, but good for debugging when tests fail.
-                .arg("--show-output")
-                .status() // use spawn() here instead for more information
-                .or_else(|e| Err(IOError::CommandErr(Some(e))))
-        })
-        .collect()
-}
diff --git a/performance/runner/src/types.rs b/performance/runner/src/types.rs
new file mode 100644
index 00000000000..235558224e1
--- /dev/null
+++ b/performance/runner/src/types.rs
@@ -0,0 +1,221 @@
+use crate::exceptions::{IOError, RunnerError};
+use chrono::prelude::*;
+use serde::{Deserialize, Serialize};
+use serde_with::{DeserializeFromStr, SerializeDisplay};
+use std::fmt::Display;
+use std::path::PathBuf;
+use std::str::FromStr;
+use std::{cmp, fmt};
+
+// `HyperfineCmd` defines a command that we want to measure with hyperfine
+#[derive(Debug, Clone)]
+pub struct HyperfineCmd<'a> {
+    pub name: &'a str,
+    pub prepare: &'a str,
+    pub cmd: &'a str,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
+pub struct Metric {
+    pub name: String,
+    pub project_name: String,
+}
+
+impl FromStr for Metric {
+    type Err = RunnerError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let split: Vec<&str> = s.split(Metric::sep()).collect();
+        match &split[..] {
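+            // a valid file stem has exactly two parts: `<metric name>___<project name>`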
+            [name, project] => Ok(Metric {
+                name: name.to_string(),
+                project_name: project.to_string(),
+            }),
+            _ => Err(RunnerError::MetricParseFail(s.to_owned())),
+        }
+    }
+}
+
+impl Metric {
+    pub fn sep() -> &'static str {
+        "___"
+    }
+
+    // encodes the metric name and project in the filename for the hyperfine output.
+    pub fn filename(&self) -> String {
+        format!("{}{}{}.json", self.name, Metric::sep(), self.project_name)
+    }
+}
+
+// This type exactly matches the type of array elements
+// from hyperfine's output. Deriving `Serialize` and `Deserialize`
+// gives us read and write capabilities via serde_json.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct Measurement {
+    pub command: String,
+    pub mean: f64,
+    pub stddev: f64,
+    pub median: f64,
+    pub user: f64,
+    pub system: f64,
+    pub min: f64,
+    pub max: f64,
+    pub times: Vec<f64>,
+}
+
+// This type exactly matches the type of hyperfine's output.
+// Deriving `Serialize` and `Deserialize` gives us read and
+// write capabilities via serde_json.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct Measurements {
+    pub results: Vec<Measurement>,
+}
+
+// struct representation for "major.minor.patch" version.
+// useful for ordering versions to get the latest
+#[derive(
+    Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, DeserializeFromStr, SerializeDisplay,
+)]
+pub struct Version {
+    pub major: i32,
+    pub minor: i32,
+    pub patch: i32,
+}
+
+impl FromStr for Version {
+    type Err = RunnerError;
+
+    // TODO: right now this only parses versions in the form "1.1.1"
+    // but it could easily also parse the form "v1.1.1" so dropping the v
+    // doesn't have to be done in the action.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let ints: Vec<i32> = s
+            .split(".")
+            .map(|x| x.parse::<i32>())
+            .collect::<Result<Vec<i32>, <i32 as FromStr>::Err>>()
+            .or_else(|_| Err(RunnerError::VersionParseFail(s.to_owned())))?;
+
+        match ints[..] {
+            [major, minor, patch] => Ok(Version {
+                major: major,
+                minor: minor,
+                patch: patch,
+            }),
+            _ => Err(RunnerError::VersionParseFail(s.to_owned())),
+        }
+    }
+}
+
+impl Version {
+    #[cfg(test)]
+    pub fn new(major: i32, minor: i32, patch: i32) -> Version {
+        Version {
+            major: major,
+            minor: minor,
+            patch: patch,
+        }
+    }
+}
+
+// A model for a single project-command pair.
+// modeling a version at release time will populate a directory with many of these
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct Baseline {
+    pub version: Version,
+    pub metric: Metric,
+    pub ts: DateTime<Utc>,
+    pub measurement: Measurement,
+}
+
+impl PartialOrd for Baseline {
+    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
+        Some(self.version.cmp(&other.version))
+    }
+}
+
+// A single performance measurement taken from the current commit
+// for one project-command pair, to be compared against a `Baseline`.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Sample {
+    pub metric: Metric,
+    pub value: f64,
+    pub ts: DateTime<Utc>,
+}
+
+impl Sample {
+    pub fn from_measurement(
+        path: &PathBuf,
+        measurement: &Measurement,
+        ts: DateTime<Utc>,
+    ) -> Result<Sample, RunnerError> {
+        // `file_name` is boop___proj.json. `file_stem` is boop___proj.
+        let filestem = path.file_stem().map_or_else(
+            || Err(IOError::BadFilestemError(path.clone())),
+            |stem| Ok(stem.to_string_lossy().to_string()),
+        )?;
+
+        let metric = Metric::from_str(&filestem)?;
+
+        // TODO: use result values not panics
+        match &measurement.times[..] {
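+            // hyperfine records one entry in `times` per run; only the first entry is kept
+            // (see the two-run TODO in `take_samples`)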
+            [] => panic!("found a sample with no measurement"),
+            [x] => Ok(Sample {
+                metric: metric,
+                value: *x,
+                ts: ts,
+            }),
+            // TODO: this is only while we're taking two runs at a time. Should be one.
+            [x, _] => Ok(Sample {
+                metric: metric,
+                value: *x,
+                ts: ts,
+            }),
+            _ => panic!("found a sample with too many measurements!"),
+        }
+    }
+}
+
+// The full output from comparing a sample taken on the current commit
+// against the baseline model for the latest version.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Calculation {
+    pub version: Version,
+    pub metric: Metric,
+    pub regression: bool,
+    pub ts: DateTime<Utc>,
+    pub sigma: f64,
+    pub mean: f64,
+    pub stddev: f64,
+    pub threshold: f64,
+    pub sample: f64,
+}
+
+// This display instance is used to derive Serialize as well via `SerializeDisplay`
+impl Display for Version {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}.{}.{}", self.major, self.minor, self.patch)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn assert_version_order() {
+        // for each pair, assert that the left one is bigger than the right one
+        let pairs = [
+            ((1, 0, 0), (0, 20, 0)),
+            ((1, 0, 1), (1, 0, 0)),
+            ((1, 0, 9), (0, 20, 0)),
+            ((1, 0, 9), (0, 0, 4)),
+            ((1, 1, 0), (1, 0, 99)),
+        ];
+
+        for (big, small) in pairs {
+            let bigv = Version::new(big.0, big.1, big.2);
+            let smallv = Version::new(small.0, small.1, small.2);
+            assert!(cmp::max(bigv, smallv) == bigv)
+        }
+    }
+}
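For reference, below is a minimal, standalone sketch of the `___` filename convention that `Metric::filename` writes and that `Metric::from_str` / `Sample::from_measurement` read back. It is not part of the diff: the simplified `Metric` struct and the `filename` / `parse_stem` helpers are illustrative stand-ins, not code from the runner.

```rust
#[derive(Debug, PartialEq)]
struct Metric {
    name: String,
    project_name: String,
}

const SEP: &str = "___";

// mirrors `Metric::filename` above: `<metric name>___<project name>.json`
fn filename(m: &Metric) -> String {
    format!("{}{}{}.json", m.name, SEP, m.project_name)
}

// mirrors `Metric::from_str` above: decode a file stem (no `.json`) back into a metric
fn parse_stem(stem: &str) -> Option<Metric> {
    let parts: Vec<&str> = stem.split(SEP).collect();
    match parts.as_slice() {
        [name, project] => Some(Metric {
            name: name.to_string(),
            project_name: project.to_string(),
        }),
        _ => None,
    }
}

fn main() {
    let metric = Metric {
        name: "parse".to_string(),
        project_name: "2000_models".to_string(),
    };

    // the writer side (`model` / `take_samples`) encodes the pair into a file name
    let file = filename(&metric);
    assert_eq!(file, "parse___2000_models.json");

    // the reader side (`from_measurement` / `Sample::from_measurement`) decodes the stem
    let decoded = parse_stem(file.trim_end_matches(".json"));
    assert_eq!(decoded, Some(metric));

    println!("round-trip ok: {}", file);
}
```

Encoding the metric and project name in the file stem is what lets the results read back by `from_json_files` be mapped to the right project-command pair without any extra bookkeeping.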