Merge pull request #85 from aai-institute/add-bq-example
Add BQ example to showcase cloud streaming of benchmark records
nicholasjng authored Feb 22, 2024
2 parents c2be53f + 1d8ce84 commit 638e35a
Showing 6 changed files with 122 additions and 5 deletions.
5 changes: 2 additions & 3 deletions .pre-commit-config.yaml
@@ -12,15 +12,14 @@ repos:
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.8.0
hooks:
# See https://github.com/pre-commit/mirrors-mypy/blob/main/.pre-commit-hooks.yaml
- id: mypy
types_or: [ python, pyi ]
args: [--ignore-missing-imports, --scripts-are-modules]
args: [--ignore-missing-imports, --explicit-package-bases]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.2
hooks:
- id: ruff
args: [ --fix, --exit-non-zero-on-fix ]
args: [--fix, --exit-non-zero-on-fix]
- id: ruff-format
- repo: https://github.com/PyCQA/bandit
rev: 1.7.7
75 changes: 75 additions & 0 deletions docs/tutorials/bq.md
@@ -0,0 +1,75 @@
# Streaming benchmarks to a cloud database

Once you obtain the results of your benchmarks, you will most likely want to store them somewhere.
Whether as flat files in object storage, on a server, or in a database, `nnbench` lets you write records anywhere, provided the destination supports JSON.

This short guide shows how to stream benchmark results to a Google Cloud BigQuery table.

## The benchmarks

Configure your benchmarks as usual, for example by collecting them in a separate Python file.
The following is a very simple example benchmark setup.

```python
--8<-- "examples/bq/benchmarks.py"
```

## Setting up a BigQuery client

To authenticate with BigQuery, follow the official [Google Cloud documentation](https://cloud.google.com/bigquery/docs/authentication#client-libs).
In this case, we rely on Application Default Credentials (ADC), which can be set up with the `gcloud` CLI, e.g. by running `gcloud auth application-default login`.

To interact with BigQuery from Python, install the `google-cloud-bigquery` package, for example with pip: `pip install --upgrade google-cloud-bigquery`.

## Creating a table

Within your configured project, proceed by creating a destination table to write the benchmarks to.
Consider the [BigQuery Python documentation on tables](https://cloud.google.com/bigquery/docs/tables#python) for how to create a table programmatically.

!!! Note
If the configured dataset does not exist, you will have to create it as well, either programmatically via the `bigquery.Client.create_dataset` API or in the Google Cloud console.

## Using BigQuery's schema auto-detection

In order to skip tedious schema inference by hand, we can use BigQuery's [schema auto-detection from JSON records](https://cloud.google.com/bigquery/docs/schema-detect).
All we have to do is configure a BigQuery load job to auto-detect the schema from the Python dictionaries in memory:

```python
--8<-- "examples/bq/bq.py:13:16"
```
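For a sense of what the load job consumes, the sketch below shows how a list of records maps onto the newline-delimited JSON format that `autodetect` inspects. The record fields here are made up for illustration; in practice, `load_table_from_json` accepts the Python dictionaries directly and handles serialization for you.

```python
import json

# Hypothetical compacted records for illustration; real records come from
# BenchmarkRecord.compact() and also carry the flattened context values.
records = [
    {"name": "prod", "value": 1, "time_ns": 1200},
    {"name": "sum", "value": 2, "time_ns": 900},
]

# NEWLINE_DELIMITED_JSON is one JSON object per line; BigQuery's schema
# auto-detection infers column names and types from these objects.
ndjson = "\n".join(json.dumps(r) for r in records)
print(ndjson)
```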

Then stream the compacted benchmark record directly to your destination table.
In this example, we flatten the benchmark context so that scalar context values can be extracted directly from the result table with raw SQL queries.
Note that you have to use a custom separator (an underscore `"_"` in this case) for the context data, since BigQuery does not allow dots in column names.

```python
--8<-- "examples/bq/bq.py:21:25"
```

!!! Tip
If you would like to save the context dictionary as a struct instead, use `mode = "inline"` in the call to `BenchmarkRecord.compact()`.
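To make the flattening concrete, here is a minimal standalone sketch of what it does to a nested context dictionary. The helper and the context keys are made up for illustration; nnbench's actual implementation lives in its context flattening logic.

```python
def flatten(d: dict, sep: str = ".", prefix: str = "") -> dict:
    """Recursively flatten a nested dict, joining nested keys with `sep`."""
    out = {}
    for k, v in d.items():
        key = f"{prefix}{sep}{k}" if prefix else k
        if isinstance(v, dict):
            out.update(flatten(v, sep=sep, prefix=key))
        else:
            out[key] = v
    return out

# A made-up context; BigQuery rejects dots in column names,
# so an underscore separator is passed instead of the default ".".
context = {"git": {"commit": "1d8ce84", "dirty": False}}
print(flatten(context, sep="_"))
# {'git_commit': '1d8ce84', 'git_dirty': False}
```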

And that's all! To check that the records appear as expected, you can now query the table, for example:

```python
# Check that the insert worked.
query = f"SELECT name, value, time_ns, git_commit AS commit FROM {table_id}"
r = client.query(query)
for row in r.result():
    print(row)
```

## Recap and the full source code

In this tutorial, we

1) defined and ran a benchmark workload using `nnbench`,
2) configured a Google Cloud BigQuery client and a load job to insert benchmark records into a table, and
3) inserted the records into the destination table.

The full source code for this tutorial is included below, and also in the [nnbench repository](https://github.com/aai-institute/nnbench/tree/main/examples/bq).

```python
--8<-- "examples/bq/bq.py"
```
11 changes: 11 additions & 0 deletions examples/bq/benchmarks.py
@@ -0,0 +1,11 @@
import nnbench


@nnbench.benchmark
def prod(a: int, b: int) -> int:
return a * b


@nnbench.benchmark
def sum(a: int, b: int) -> int:
return a + b
27 changes: 27 additions & 0 deletions examples/bq/bq.py
@@ -0,0 +1,27 @@
from google.cloud import bigquery

import nnbench
from nnbench.context import GitEnvironmentInfo


def main():
client = bigquery.Client()

# TODO: Fill these out with your appropriate resource names.
table_id = "<PROJECT>.<DATASET>.<TABLE>"

job_config = bigquery.LoadJobConfig(
autodetect=True, source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
)

runner = nnbench.BenchmarkRunner()
res = runner.run("benchmarks.py", params={"a": 1, "b": 1}, context=(GitEnvironmentInfo(),))

load_job = client.load_table_from_json(
res.compact(mode="flatten", sep="_"), table_id, job_config=job_config
)
load_job.result()


if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -27,6 +27,7 @@ nav:
- tutorials/mnist.md
- tutorials/prefect.md
- tutorials/streamlit.md
- tutorials/bq.md
- API Reference: reference/
- Contributing: CONTRIBUTING.md

8 changes: 6 additions & 2 deletions src/nnbench/types.py
@@ -22,7 +22,9 @@ class BenchmarkRecord:
benchmarks: list[dict[str, Any]]

def compact(
self, mode: Literal["flatten", "inline", "omit"] = "inline"
self,
mode: Literal["flatten", "inline", "omit"] = "inline",
sep: str = ".",
) -> list[dict[str, Any]]:
"""
Prepare the benchmark results, optionally inlining the context either as a
@@ -34,6 +36,8 @@ def compact(
How to handle the context. ``"omit"`` leaves out the context entirely, ``"inline"``
inserts it into the benchmark dictionary as a single entry named ``"context"``, and
``"flatten"`` inserts the flattened context values into the dictionary.
sep: str
The separator to use when flattening the context, i.e. when ``mode = "flatten"``.
Returns
-------
@@ -47,7 +51,7 @@
if mode == "inline":
b["context"] = self.context.data
elif mode == "flatten":
flat = self.context.flatten()
flat = self.context.flatten(sep=sep)
b.update(flat)
b["_context_keys"] = tuple(self.context.keys())
return self.benchmarks
