Prototype RLops Utility #307

Closed · wants to merge 22 commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
compare.pdf
compare.png
balance_bot.xml
cleanrl/ppo_continuous_action_isaacgym/isaacgym/examples
cleanrl/ppo_continuous_action_isaacgym/isaacgym/isaacgym
186 changes: 186 additions & 0 deletions cleanrl_utils/rlops.py
@@ -0,0 +1,186 @@
import argparse
from distutils.util import strtobool
from typing import List

import expt
import matplotlib.pyplot as plt
import numpy as np
import wandb
import wandb.apis.reports as wb # noqa
from expt import Hypothesis, Run
from expt.plot import GridPlot

wandb.require("report-editing")
api = wandb.Api()


def parse_args():
# fmt: off
parser = argparse.ArgumentParser()
parser.add_argument("--exp-name", type=str, default="ddpg_continuous_action_jax",
help="the name of this experiment")
parser.add_argument("--wandb-project-name", type=str, default="cleanrl",
help="the wandb's project name")
parser.add_argument("--wandb-entity", type=str, default="openrlbenchmark",
help="the entity (team) of wandb's project")
parser.add_argument("--tags", nargs="+", default=["v1.0.0b2-9-g4605546", "rlops-pilot"],
help="the tags of the runsets")
parser.add_argument("--env-ids", nargs="+", default=["Hopper-v2", "Walker2d-v2", "HalfCheetah-v2"],
help="the ids of the environment to compare")
parser.add_argument("--output-filename", type=str, default="compare.png",
help="the output filename of the plot")
parser.add_argument("--report", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
help="if toggled, a wandb report will be created")
# fmt: on
return parser.parse_args()


def create_hypothesis(name: str, wandb_runs: List[wandb.apis.public.Run]) -> Hypothesis:
runs = []
for idx, run in enumerate(wandb_runs):
wandb_run = run.history()
if "videos" in wandb_run:
            wandb_run = wandb_run.drop(columns=["videos"])
runs += [Run(f"seed{idx}", wandb_run)]
return Hypothesis(name, runs)


class Runset:
def __init__(self, name: str, filters: dict, entity: str, project: str, groupby: str = ""):
self.name = name
self.filters = filters
self.entity = entity
self.project = project
self.groupby = groupby

@property
def runs(self):
return wandb.Api().runs(path=f"{self.entity}/{self.project}", filters=self.filters)

@property
def report_runset(self):
return wb.RunSet(
name=self.name,
entity=self.entity,
project=self.project,
filters={"$or": [self.filters]},
groupby=[self.groupby] if len(self.groupby) > 0 else None,
)


def compare(
runsetss: List[List[Runset]],
env_ids: List[str],
ncols: int,
output_filename: str = "compare.png",
):
blocks = []
for idx, env_id in enumerate(env_ids):
blocks += [
wb.PanelGrid(
runsets=[runsets[idx].report_runset for runsets in runsetss],
panels=[
wb.LinePlot(
x="global_step",
y=["charts/episodic_return"],
title=env_id,
title_x="Steps",
title_y="Episodic Return",
max_runs_to_show=100,
smoothing_factor=0.8,
groupby_rangefunc="stderr",
legend_template="${runsetName}",
),
wb.LinePlot(
x="_runtime",
y=["charts/episodic_return"],
title=env_id,
title_y="Episodic Return",
max_runs_to_show=100,
smoothing_factor=0.8,
groupby_rangefunc="stderr",
legend_template="${runsetName}",
),
# wb.MediaBrowser(
# num_columns=2,
# media_keys="videos",
# ),
],
),
]

nrows = np.ceil(len(env_ids) / ncols).astype(int)
figsize = (ncols * 4, nrows * 3)
fig, axes = plt.subplots(
nrows=nrows,
ncols=ncols,
figsize=figsize,
# sharex=True,
# sharey=True,
)

for idx, env_id in enumerate(env_ids):
ex = expt.Experiment("Comparison")
for runsets in runsetss:
h = create_hypothesis(runsets[idx].name, runsets[idx].runs)
ex.add_hypothesis(h)
ax = axes.flatten()[idx]
ex.plot(
ax=ax,
title=env_id,
x="_runtime",
y="charts/episodic_return",
err_style="band",
std_alpha=0.1,
rolling=50,
n_samples=400,
legend=False,
)

h, l = ax.get_legend_handles_labels()
fig.legend(h, l, loc="upper center", ncol=2)
fig.subplots_adjust(top=0.9)
# remove the empty axes
for ax in axes.flatten()[len(env_ids) :]:
ax.remove()

print(f"saving figure to {output_filename}")
plt.savefig(f"{output_filename}", bbox_inches="tight")
plt.savefig(f"{output_filename.replace('.png', '.pdf')}", bbox_inches="tight")
return blocks


if __name__ == "__main__":
args = parse_args()

g = GridPlot(y_names=args.env_ids)
blocks = []
runsetss = []
for tag in args.tags:
runsets = []
for env_id in args.env_ids:
runsets += [
Runset(
name=f"CleanRL's {args.exp_name} ({tag})",
filters={
"$and": [{"config.env_id.value": env_id}, {"tags": tag}, {"config.exp_name.value": args.exp_name}]
},
entity=args.wandb_entity,
project=args.wandb_project_name,
groupby="exp_name",
)
]
print(f"CleanRL's {args.exp_name} ({tag}) in {env_id} has {len(runsets[0].runs)} runs")
assert len(runsets[0].runs) > 0, f"CleanRL's {args.exp_name} ({tag}) in {env_id} has no runs"
runsetss += [runsets]

    blocks = compare(runsetss, args.env_ids, output_filename=args.output_filename, ncols=2)
if args.report:
print("saving report")
report = wb.Report(
project="cleanrl",
title=f"Regression Report: {args.exp_name} ({args.tags})",
blocks=blocks,
)
report.save()
print(f"view the generated report at {report.url}")
37 changes: 37 additions & 0 deletions cleanrl_utils/rlops_tags.py
@@ -0,0 +1,37 @@
import argparse

import wandb

api = wandb.Api()


def parse_args():
# fmt: off
parser = argparse.ArgumentParser()
parser.add_argument("--wandb-project-name", type=str, default="cleanrl",
help="the wandb's project name")
parser.add_argument("--wandb-entity", type=str, default="openrlbenchmark",
help="the entity (team) of wandb's project")

parser.add_argument("--add", type=str, default="",
help="the tag to be added to any runs with the `--source-tag`")
parser.add_argument("--remove", type=str, default="",
help="the tag to be removed from any runs with the `--source-tag`")
parser.add_argument("--source-tag", type=str, default="v1.0.0b2-7-g4bb6766",
help="the source tag of the set of runs")
# fmt: on
return parser.parse_args()


if __name__ == "__main__":
args = parse_args()
print(args)
runs = api.runs(path=f"{args.wandb_entity}/{args.wandb_project_name}", filters={"tags": {"$in": [args.source_tag]}})
    for run in runs:
        # copy the run's tag list, adjust it, and push the update back to wandb
        tags = run.tags
        if args.add and args.add not in tags:
            tags.append(args.add)
        if args.remove and args.remove in tags:
            tags.remove(args.remove)
        run.tags = tags
        run.update()
61 changes: 61 additions & 0 deletions docs/advanced/rlops.md
@@ -0,0 +1,61 @@
# RLops

This document describes how we do "RLops" to validate new features and bug fixes and to avoid introducing regressions.


## Background
DRL is brittle and has a series of reproducibility issues: even bug fixes can sometimes introduce performance regressions (e.g., see [how a bug fix of contact force in MuJoCo results in worse performance for PPO](https://github.com/openai/gym/pull/2762#discussion_r853488897)). Therefore, it is essential to understand how proposed changes impact the performance of the algorithms. Broadly, we distinguish two types of contributions: 1) **non-performance-impacting changes** and 2) **performance-impacting changes**.

* **non-performance-impacting changes**: changes that do *not* impact the performance of the algorithm, such as documentation fixes (#282), variable renames (#257), and removal of unused code (#287). We can merge these without worrying too much about the consequences.
* **performance-impacting changes**: changes that impact the algorithm's performance. Examples include slightly modifying the `gamma` parameter in PPO (https://github.com/vwxyzjn/cleanrl/pull/209), properly handling action bounds in DDPG (https://github.com/vwxyzjn/cleanrl/pull/211), and fixing bugs (https://github.com/vwxyzjn/cleanrl/pull/281).


**Importantly, no matter how small a performance-impacting change is, we need to re-run the benchmark to ensure there is no regression**. This document proposes a way to re-run the experiments and check for regressions seamlessly.

## Methodology


### (Step 1) Run the benchmark

We usually run the benchmark experiments through [`benchmark.py`](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl_utils/benchmark.py), such as the following:

```bash
poetry install
OMP_NUM_THREADS=1 xvfb-run -a poetry run python -m cleanrl_utils.benchmark \
--env-ids CartPole-v1 Acrobot-v1 MountainCar-v0 \
--command "poetry run python cleanrl/ppo.py --cuda False --track --capture-video" \
--num-seeds 3 \
--workers 9
```

Under the hood, this script invokes an `autotag` feature that tries to tag the experiments with version control information, such as the git tag (e.g., `v1.0.0b1-4-g4ea73d9`) and the GitHub PR number (e.g., `pr-308`). This makes it easy to compare the performance of the same algorithm across different versions.
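
As a rough sketch of the idea (not the actual implementation in `benchmark.py`), a tag such as `v1.0.0b1-4-g4ea73d9` can be derived from the local git checkout with `git describe`:

```python
# Hedged sketch: derive a version tag like "v1.0.0b1-4-g4ea73d9" from git.
# The real autotag logic in benchmark.py may differ.
import subprocess


def git_autotag() -> str:
    # "git describe --tags" returns the latest tag, the number of commits
    # since that tag, and the abbreviated commit hash, e.g. "v1.0.0b1-4-g4ea73d9"
    return subprocess.check_output(["git", "describe", "--tags"], text=True).strip()


if __name__ == "__main__":
    print(git_autotag())
```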


### (Step 2) Regression check

Let's say our latest experiments are tagged with `v1.0.0b2-9-g4605546`. We can then run the following command to compare their performance against the current version `latest`:


```bash
python rlops.py --exp-name ddpg_continuous_action_jax \
--wandb-project-name cleanrl \
--wandb-entity openrlbenchmark \
--tags v1.0.0b2-9-g4605546 rlops-pilot \
--env-ids Hopper-v2 Walker2d-v2 HalfCheetah-v2 \
--output-filename compare.png \
--report
```
which generates a wandb report with the following figure and the corresponding tables.

<img width="1195" alt="image" src="https://user-images.githubusercontent.com/5555347/196775462-2ef25c47-72dd-426d-88b8-9d74e5062936.png">
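
If the script reports that a tag has no runs, a quick way to sanity-check which runs a tag matches is to query the public wandb API directly. Here is a minimal sketch; it mirrors the filters that `rlops.py` builds, and the entity, project, experiment name, and tag below are just the defaults used above:

```python
# Hedged sketch: count the runs matching a given tag per environment,
# using the same filter shape as rlops.py.
import wandb

api = wandb.Api()
for env_id in ["Hopper-v2", "Walker2d-v2", "HalfCheetah-v2"]:
    runs = api.runs(
        path="openrlbenchmark/cleanrl",
        filters={
            "$and": [
                {"config.env_id.value": env_id},
                {"tags": "v1.0.0b2-9-g4605546"},
                {"config.exp_name.value": "ddpg_continuous_action_jax"},
            ]
        },
    )
    print(f"{env_id}: {len(runs)} matching runs")
```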


### (Step 3) Merge the PR

Once we confirm there is no performance regression, we can merge the PR. Furthermore, we will label the new experiments as `latest` (and correspondingly remove the `latest` tag from the `v1.0.0b2-7-gxfd3d3` runs):

```bash
python rlops_tags.py --add latest --source-tag v1.0.0b2-9-g4605546
python rlops_tags.py --remove latest --source-tag rlops-pilot
```