diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index b2fa8a5a5..64020f34a 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,7 +5,7 @@ version: 2
 updates:
-  # Maintain dependencies for GitHub Actions
+  # Maintain dependencies for GitHub Actions.
   - package-ecosystem: "github-actions"
     directory: "/"
     schedule:
@@ -14,7 +14,7 @@ updates:
       - dependency-name: "*"
         update-types: ["version-update:semver-patch"]
-  # Maintain dependencies for npm
+  # Maintain dependencies for npm.
   - package-ecosystem: "npm"
     directory: "/"
     schedule:
@@ -22,7 +22,7 @@ updates:
     ignore:
       - dependency-name: "*"
         update-types: ["version-update:semver-patch"]
-  # Maintain dependencies for npm
+  # Maintain dependencies for pip.
   - package-ecosystem: "pip"
     directory: "/"
     schedule:
diff --git a/.github/workflows/stable_learning_control.yml b/.github/workflows/stable_learning_control.yml
index 370f3a538..0418918eb 100644
--- a/.github/workflows/stable_learning_control.yml
+++ b/.github/workflows/stable_learning_control.yml
@@ -7,7 +7,7 @@ on:
     tags-ignore:
       - v*.*.*
 jobs:
-  markdown-lint: # Lints the markdown code
+  markdown-lint: # Lints the markdown code.
     name: runner / remark-lint
     runs-on: ubuntu-latest
     steps:
@@ -18,7 +18,7 @@ jobs:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           reporter: github-check
           level: warning
-  black: # Check python code format
+  black: # Check python code format.
     name: runner / black
     runs-on: ubuntu-latest
     steps:
@@ -28,7 +28,7 @@ jobs:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           reporter: github-check
           level: warning
-  flake8: # Lints python code
+  flake8: # Lints python code.
     name: runner / flake8
     runs-on: ubuntu-latest
     steps:
@@ -54,9 +54,9 @@ jobs:
     name: python-tests (Testing)
     runs-on: ubuntu-latest
     strategy:
-      fail-fast: false # Run all matrix jobs
+      fail-fast: false # Run all matrix jobs.
       matrix:
-        python-version: [3.8, 3.9, "3.10"] # Supported python versions
+        python-version: [3.8, 3.9, "3.10"] # Supported python versions.
     steps:
       - name: Checkout stable-learning-control repository
         uses: actions/checkout@v3
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5f3f29c93..2b1ddf8a1 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -15,7 +15,7 @@ jobs:
           level: warning
           exclude: |
             ./CHANGELOG.md
-  alex: # Checks docs for inconsiderate writing
+  alex: # Checks docs for inconsiderate writing.
     name: runner / alex
     runs-on: ubuntu-latest
     steps:
@@ -25,7 +25,7 @@ jobs:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           reporter: github-check
           level: warning
-  markdown-lint: # Lints the markdown code
+  markdown-lint: # Lints the markdown code.
     name: runner / remark-lint
     runs-on: ubuntu-latest
     steps:
@@ -36,7 +36,7 @@ jobs:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           reporter: github-check
           level: warning
-  black: # Check python code format
+  black: # Check python code format.
     name: runner / black
     runs-on: ubuntu-latest
     steps:
@@ -46,7 +46,7 @@ jobs:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           reporter: github-check
           level: warning
-  flake8: # Lints python code
+  flake8: # Lints python code.
     name: runner / flake8
     runs-on: ubuntu-latest
     steps:
@@ -72,9 +72,9 @@ jobs:
     name: python-tests (Testing)
     runs-on: ubuntu-latest
     strategy:
-      fail-fast: false # Run all matrix jobs
+      fail-fast: false # Run all matrix jobs.
       matrix:
-        python-version: [3.8, 3.9, "3.10"] # Supported python versions
+        python-version: [3.8, 3.9, "3.10"] # Supported python versions.
steps: - name: Checkout stable-learning-control repository uses: actions/checkout@v3 diff --git a/docs/Makefile b/docs/Makefile index 8ff39e1b6..c760baea6 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,6 +1,6 @@ # Makefile for Stable Learning Control Sphinx documentation -# You can set these variables from the command line. +# You can set these variables from the command line SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = Stable Learning Control diff --git a/docs/source/conf.py b/docs/source/conf.py index 059a95a40..677176391 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -68,12 +68,12 @@ def __getattr__(cls, name): "myst_parser", ] -# Extension settings +# Extension settings. autosummary_generate = True autosummary_generate_overwrite = True autodoc_member_order = "bysource" -# imgmath settings +# imgmath settings. imgmath_image_format = "svg" imgmath_font_size = 14 @@ -110,7 +110,7 @@ def __getattr__(cls, name): # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path +# This patterns also effect to html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "TODO.*", "README.*"] # The name of the Pygments (syntax highlighting) style to use. diff --git a/docs/source/control/eval_robustness.rst b/docs/source/control/eval_robustness.rst index dfcb865ce..8e2cd28ab 100644 --- a/docs/source/control/eval_robustness.rst +++ b/docs/source/control/eval_robustness.rst @@ -99,15 +99,15 @@ under several Impulse disturbances. if __name__ == "__main__": - # Parse input arguments + # Parse input arguments. parser = argparse.ArgumentParser() parser.add_argument("fpath", type=str, help="The path where the policy is stored") args = parser.parse_args() - # Retrieve dataframe + # Retrieve dataframe. robustness_eval_df = pd.read_csv(Path(args.fpath).absolute()) - # Retrieve observation and reference data from the dataframe + # Retrieve observation and reference data from the dataframe. o_disturbances_df = robustness_eval_df.query("variable == 'observation'").dropna( axis=1, how="all" ) @@ -115,7 +115,7 @@ under several Impulse disturbances. axis=1, how="all" ) - # Merge observations and references into one dataframe + # Merge observations and references into one dataframe. obs_df_tmp = o_disturbances_df.query("observation == 3") obs_df_tmp["signal"] = "obs_" + (obs_df_tmp["observation"] + 1).astype(str) obs_df_tmp.insert(len(obs_df_tmp.columns), "type", "observation") @@ -190,14 +190,14 @@ class to add all the required methods and attributes to make it compatible with import numpy as np from stable_learning_control.simzoo.simzoo.common.disturber import Disturber - # Disturber config used to overwrite the default config + # Disturber config used to overwrite the default config. DISTURBER_CFG = { - # Disturbance applied to environment variables + # Disturbance applied to environment variables. "env": { "description": "Pole length disturbance", - # The env variable which you want to disturb + # The env variable which you want to disturb. "variable": "length", - # The range of values you want to use for each disturbance iteration + # The range of values you want to use for each disturbance iteration. "variable_range": np.linspace(0.5, 2.0, num=5, dtype=np.float32), # Label used in robustness plots. 
"label": "r: %s", @@ -271,10 +271,10 @@ When editing the ``DISTURBANCE_CFG`` config in the :class:`~stable_learning_cont :linenos: :emphasize-lines: 5 - # A random noise that is applied at every timestep + # A random noise that is applied at every timestep. "noise": { "description": "Random noise disturbance", - # The means and standards deviations of the random noise disturbance + # The means and standards deviations of the random noise disturbance. "noise_range": { "mean": np.linspace(80, 155, num=3, dtype=np.int16), "std": np.linspace(1.0, 5.0, num=3, dtype=np.int16), @@ -292,14 +292,14 @@ When editing the ``DISTURBANCE_CFG`` config in the :class:`~stable_learning_cont # Disturbance applied to the *OUTPUT* of the environment step function "output": { - # The disturbance variant used when no variant is given + # The disturbance variant used when no variant is given. "default_variant": "impulse", - # A random noise that is applied at every timestep + # A random noise that is applied at every timestep. "noise": { "description": "Random noise disturbance", - # The means and standards deviations of the random noise disturbance + # The means and standards deviations of the random noise disturbance. "noise_range": { - # "mean": np.linspace(80, 155, num=3, dtype=np.int16), # All obs + # "mean": np.linspace(80, 155, num=3, dtype=np.int16), # All obs. "mean": np.vstack( ( np.linspace(80, 155, num=3, dtype=np.int16), # Obs 1 @@ -308,7 +308,7 @@ When editing the ``DISTURBANCE_CFG`` config in the :class:`~stable_learning_cont np.linspace(80, 155, num=3, dtype=np.int16), # Obs 4 ) ).T, - # "std": np.linspace(1.0, 5.0, num=3, dtype=np.int16), # All Obs + # "std": np.linspace(1.0, 5.0, num=3, dtype=np.int16), # All Obs. "std": np.vstack( ( np.linspace(1.0, 5.0, num=3, dtype=np.int16), # Obs 1 @@ -332,12 +332,12 @@ When editing the ``DISTURBANCE_CFG`` config in the :class:`~stable_learning_cont :linenos: :emphasize-lines: 4, 12 - # Input and output noise disturbance + # Input and output noise disturbance. "noise": { "description": "Random input and output noise disturbance", "input_noise": { # The means and standards deviations of the random input noise - # disturbance + # disturbance. "noise_range": { "mean": np.linspace(80, 155, num=3, dtype=np.int16), "std": np.linspace(1.0, 5.0, num=3, dtype=np.int16), @@ -345,7 +345,7 @@ When editing the ``DISTURBANCE_CFG`` config in the :class:`~stable_learning_cont }, "output_noise": { # The means and standards deviations of the random output noise - # disturbance + # disturbance. "noise_range": { "mean": np.linspace(80, 155, num=3, dtype=np.int16), "std": np.linspace(1.0, 5.0, num=3, dtype=np.int16), diff --git a/docs/source/control/saving_and_loading.rst b/docs/source/control/saving_and_loading.rst index 0bf47d98c..71a929f0d 100644 --- a/docs/source/control/saving_and_loading.rst +++ b/docs/source/control/saving_and_loading.rst @@ -189,10 +189,10 @@ the :torch:`PyTorch documentation = 0 else "last" @@ -134,17 +134,17 @@ def noise_disturbance(mean, std): ) sys.exit(0) - # Remove action clipping if present + # Remove action clipping if present. if hasattr(env.unwrapped, "_clipped_action"): env.unwrapped._clipped_action = False - # Setup logger + # Setup logger. output_dir = Path(args.fpath).joinpath("eval") logger = EpochLogger( verbose_fmt="table", output_dir=output_dir, output_fname="eval_statistics.csv" ) - # Set max episode length + # Set max episode length. 
if args.len is None: max_ep_len = env._max_episode_steps else: @@ -181,7 +181,7 @@ def noise_disturbance(mean, std): n_disturbance = 0 disturbances_length = len(disturbance_range["mean"]) soi_found, ref_found = True, True - supports_deterministic = True # Only supported with gaussian algorithms + supports_deterministic = True # Only supported with gaussian algorithms. log_to_std_out("Adding random observation noise.", type="info") for _ in range(0, disturbances_length): o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 @@ -199,9 +199,9 @@ def noise_disturbance(mean, std): f"Disturbance {n_disturbance}: mean: {mean}, std: {std}", type="info" ) - # Perform disturbed episodes + # Perform disturbed episodes. while n < args.episodes: - # Render env if requested + # Render env if requested. if args.render and not render_error: try: env.render() @@ -217,7 +217,7 @@ def noise_disturbance(mean, std): type="warning", ) - # Retrieve action + # Retrieve action. if args.deterministic and supports_deterministic: try: a = policy.get_action(o, deterministic=args.deterministic) @@ -243,7 +243,7 @@ def noise_disturbance(mean, std): ) # NOTE: In this example we add a small random noise to the action o, r, d, info = env.step(a) - # Increment counters + # Increment counters. ep_ret += r ep_len += 1 ################################################ @@ -275,7 +275,7 @@ def noise_disturbance(mean, std): type="warning", ) - # Store performance measurements + # Store performance measurements. if d or (ep_len == max_ep_len): died = ep_len < max_ep_len logger.store(EpRet=ep_ret, EpLen=ep_len, DeathRate=(float(died))) @@ -284,24 +284,24 @@ def noise_disturbance(mean, std): % (n, ep_ret, ep_len, died) ) - # Store observations + # Store observations. o_episode_df = pd.DataFrame(path["o"]) o_episode_df.insert(0, "step", range(0, ep_len)) o_episode_df = pd.melt( o_episode_df, id_vars="step", var_name="observation", - ) # Flatten robustness_eval_df + ) # Flatten robustness_eval_df. o_episodes_dfs.append(o_episode_df) - # Store episode rewards + # Store episode rewards. r_episode_df = pd.DataFrame( {"step": range(0, ep_len), "reward": path["r"]} ) r_episode_df.insert(len(r_episode_df.columns), "episode", n) r_episodes_dfs.append(r_episode_df) - # Store states of interest + # Store states of interest. if soi_found: soi_episode_df = pd.DataFrame(path["state_of_interest"]) soi_episode_df.insert(0, "step", range(0, ep_len)) @@ -310,10 +310,10 @@ def noise_disturbance(mean, std): id_vars="step", var_name="state_of_interest", value_name="error", - ) # Flatten robustness_eval_df + ) # Flatten robustness_eval_df. soi_episodes_dfs.append(soi_episode_df) - # Store reference + # Store reference. if ref_found: ref_episode_df = pd.DataFrame(path["reference"]) ref_episode_df.insert(0, "step", range(0, ep_len)) @@ -321,10 +321,10 @@ def noise_disturbance(mean, std): ref_episode_df, id_vars="step", var_name="reference", - ) # Flatten robustness_eval_df + ) # Flatten robustness_eval_df. ref_episodes_dfs.append(ref_episode_df) - # Increment counters and reset storage variables + # Increment counters and reset storage variables. n += 1 o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 path = { @@ -334,14 +334,14 @@ def noise_disturbance(mean, std): "state_of_interest": [], } - # Print robustness evaluation diagnostics + # Print robustness evaluation diagnostics. 
logger.log_tabular("EpRet", with_min_and_max=True) logger.log_tabular("EpLen", average_only=True) logger.log_tabular("DeathRate") log_to_std_out("") logger.dump_tabular() - # Add extra disturbance information to the robustness eval robustness_eval_df + # Add extra disturbance information to the robustness eval robustness_eval_df. disturbance_label = ( env.disturbance_info["label"] if ( @@ -383,7 +383,7 @@ def noise_disturbance(mean, std): ) ref_disturbances_dfs.append(ref_disturbance_df) - # Reset storage buckets and go to next disturbance + # Reset storage buckets and go to next disturbance. o_episodes_dfs = [] r_episodes_dfs = [] soi_episodes_dfs = [] @@ -396,7 +396,7 @@ def noise_disturbance(mean, std): n_disturbance += 1 ################################################ - # Merge robustness evaluation information for all disturbances + # Merge robustness evaluation information for all disturbances. o_disturbances_df = pd.concat(o_disturbances_dfs, ignore_index=True) r_disturbances_df = pd.concat(r_disturbances_dfs, ignore_index=True) soi_disturbances_df = pd.concat(soi_disturbances_dfs, ignore_index=True) @@ -431,7 +431,7 @@ def noise_disturbance(mean, std): disturbance_variant, ) - # Save robustness evaluation robustness_eval_df and return it to the user + # Save robustness evaluation robustness_eval_df and return it to the user. if args.save_result: results_path = logger.output_dir.joinpath("results.csv") logger.log( @@ -450,7 +450,7 @@ def noise_disturbance(mean, std): log_to_std_out("Showing robustness evaluation plots...", type="info") sns.set(style="darkgrid", font_scale=args.font_scale) - # Unpack required data from robustness_eval_df + # Unpack required data from robustness_eval_df. obs_found, rew_found, soi_found, ref_found = True, True, True, True o_disturbances_df, ref_disturbances_df = ( pd.DataFrame(), @@ -481,13 +481,13 @@ def noise_disturbance(mean, std): else: ref_found = False - # Merge observations and references + # Merge observations and references. if obs_found: obs_df_tmp = o_disturbances_df.copy(deep=True) obs_df_tmp["signal"] = "obs_" + (obs_df_tmp["observation"] + 1).astype(str) obs_df_tmp.insert(len(obs_df_tmp.columns), "type", "observation") - # Retrieve the requested observations + # Retrieve the requested observations. observations = args.observations if hasattr(args, "observations") else None observations = validate_observations(observations, o_disturbances_df) observations = [obs - 1 for obs in observations] # Humans count from 1 @@ -498,7 +498,7 @@ def noise_disturbance(mean, std): ref_df_tmp.insert(len(ref_df_tmp.columns), "type", "reference") obs_ref_df = pd.concat([obs_df_tmp, ref_df_tmp], ignore_index=True) - # Loop though all disturbances and plot the observations and references in one plot + # Loop though all disturbances and plot the observations and references in one plot. fig_title = "{} under several {}.".format( "Observation and reference" if all([obs_found, ref_found]) @@ -510,7 +510,7 @@ def noise_disturbance(mean, std): obs_ref_df.loc[obs_ref_df["disturbance_index"] == 0, "disturbance"] = ( obs_ref_df.loc[obs_ref_df["disturbance_index"] == 0, "disturbance"] + " (original)" - ) # Append original to original value + ) # Append original to original value. if not args.merged: num_plots = len(obs_ref_df.disturbance.unique()) total_cols = 3 @@ -546,7 +546,7 @@ def noise_disturbance(mean, std): ).set_title(fig_title) figs["observations"].append(fig) - # Plot mean cost + # Plot mean cost. 
if rew_found: fig = plt.figure(tight_layout=True) figs["costs"].append(fig) @@ -557,7 +557,7 @@ def noise_disturbance(mean, std): r_disturbances_df["disturbance_index"] == 0, "disturbance" ] + " (original)" - ) # Append original to original value + ) # Append original to original value. sns.lineplot( data=r_disturbances_df, x="step", y="reward", ci="sd", hue="disturbance" ).set_title( @@ -576,7 +576,7 @@ def noise_disturbance(mean, std): type="warning", ) - # Plot states of interest + # Plot states of interest. if soi_found: n_soi = soi_disturbances_df["state_of_interest"].max() + 1 soi_disturbances_df.loc[ @@ -586,7 +586,7 @@ def noise_disturbance(mean, std): soi_disturbances_df["disturbance_index"] == 0, "disturbance" ] + " (original)" - ) # Append original to original value + ) # Append original to original value. for index in range(0, n_soi): fig = plt.figure(tight_layout=True) figs["states_of_interest"].append(fig) @@ -614,7 +614,7 @@ def noise_disturbance(mean, std): type="warning", ) - # Save plots + # Save plots. if args.save_figs: figs_path = output_dir.joinpath("figures") figs_extension = ( diff --git a/examples/manual_env_policy_inference.py b/examples/manual_env_policy_inference.py index e7bfab369..9908dbf7b 100644 --- a/examples/manual_env_policy_inference.py +++ b/examples/manual_env_policy_inference.py @@ -22,19 +22,19 @@ # NOTE: STEP: 1b: If step 1 fails recreate the environment and load the Pytorch/ # TF2 agent separately. - # Create the environment + # Create the environment. # NOTE: Here the 'FlattenObservation' wrapper is used to make sure the alg works # with dictionary based observation spaces. env = gym.make("PandaReach-v1") env = gym.wrappers.FlattenObservation(env) - # Load the policy + # Load the policy. if AGENT_TYPE.lower() == "tf2": policy = load_tf_policy(AGENT_FOLDER, itr="last", env=env) # Load TF2 agent else: policy = load_pytorch_policy( AGENT_FOLDER, env=env, itr="last" - ) # Load Pytorch agent + ) # Load Pytorch agent. # NOTE: Step 2: Try to run the policy on the environment. try: diff --git a/examples/pytorch/lac_ray_hyper_parameter_tuning.py b/examples/pytorch/lac_ray_hyper_parameter_tuning.py index b938d8787..66469c6ec 100644 --- a/examples/pytorch/lac_ray_hyper_parameter_tuning.py +++ b/examples/pytorch/lac_ray_hyper_parameter_tuning.py @@ -21,7 +21,7 @@ import gymnasium as gym import numpy as np -# Import the algorithm we want to tune +# Import the algorithm we want to tune. from stable_learning_control.control.algos.pytorch.lac.lac import lac from stable_learning_control.utils.import_utils import lazy_importer @@ -40,10 +40,10 @@ def train_lac(config): config (dict): The Ray tuning configuration dictionary. """ - # Unpack trainable arguments + # Unpack trainable arguments. env_name = config.pop("env_name") - # Run algorithm training + # Run algorithm training. lac( lambda: gym.make(env_name), **config, @@ -51,18 +51,18 @@ def train_lac(config): if __name__ == "__main__": - # Pass system arguments to ray + # Pass system arguments to ray. if len(sys.argv) > 1: ray.init(redis_address=sys.argv[1]) - # Setup the logging dir + # Setup the logging dir. dirname = osp.dirname(__file__) log_path = osp.abspath(osp.join(dirname, "../../data/ray_results")) - # Setup hyperparameter search starting point + # Setup hyperparameter search starting point. current_best_params = [{"gamma": 0.995, "lr_a": 1e-4, "alpha3": 0.2}] - # Setup the parameter space for you hyperparameter search + # Setup the parameter space for you hyperparameter search. 
     # NOTE: This script uses the hyperopt search algorithm for efficient hyperparameter
     # selection. For more information see
     # https://docs.ray.io/en/latest/tune/api_docs/suggestion.html?highlight=hyperopt.
@@ -81,7 +81,7 @@ def train_lac(config):
         points_to_evaluate=current_best_params,
     )
-    # Start the hyperparameter tuning job
+    # Start the hyperparameter tuning job.
     # NOTE: We use the ASHA job scheduler to early terminate bad trials, pause trials,
     # clone trials, and alter hyperparameters of a running trial. For more information
     # see https://docs.ray.io/en/master/tune/api_docs/schedulers.html.
@@ -102,7 +102,7 @@ def train_lac(config):
         local_dir=log_path,
     )
-    # Print the best trail
+    # Print the best trial.
     best_trial = analysis.get_best_trial(metric="mean_ep_ret", mode="min", scope="all")
     best_path = analysis.get_best_logdir(metric="mean_ep_ret", mode="min", scope="all")
     best_config = analysis.get_best_config(
diff --git a/examples/pytorch/sac_exp_grid_search.py b/examples/pytorch/sac_exp_grid_search.py
index f4c88a362..7f6422d94 100644
--- a/examples/pytorch/sac_exp_grid_search.py
+++ b/examples/pytorch/sac_exp_grid_search.py
@@ -19,10 +19,10 @@
 import torch
 from stable_learning_control.control.utils.run_utils import ExperimentGrid
-# Import the RL agent you want to perform the grid search for
+# Import the RL agent you want to perform the grid search for.
 from stable_learning_control.control.algos.pytorch.sac.sac import sac
-# Script parametesr
+# Script parameters.
 ENV_NAME = "Oscillator-v1" # The environment on which you want to train the agent.
 if __name__ == "__main__":
@@ -41,5 +41,5 @@
     eg.add("ac_kwargs:hidden_sizes", [(32,), (64, 64)], "hid")
     eg.add("ac_kwargs:activation", [torch.nn.ReLU, torch.nn.ReLU], "")
-    # Run the grid search
+    # Run the grid search.
     eg.run(sac, num_cpu=args.cpu)
diff --git a/examples/pytorch/sac_ray_hyper_parameter_tuning.py b/examples/pytorch/sac_ray_hyper_parameter_tuning.py
index 6229962d6..14f8eed36 100644
--- a/examples/pytorch/sac_ray_hyper_parameter_tuning.py
+++ b/examples/pytorch/sac_ray_hyper_parameter_tuning.py
@@ -21,7 +21,7 @@
 import gymnasium as gym
 import numpy as np
-# Import the algorithm we want to tune
+# Import the algorithm we want to tune.
 from stable_learning_control.control.algos.pytorch.sac.sac import sac
 from stable_learning_control.utils.import_utils import lazy_importer
@@ -40,10 +40,10 @@ def train_sac(config):
         config (dict): The Ray tuning configuration dictionary.
     """
-    # Unpack trainable arguments
+    # Unpack trainable arguments.
     env_name = config.pop("env_name")
-    # Run algorithm training
+    # Run algorithm training.
     sac(
         lambda: gym.make(env_name),
         **config,
     )
 if __name__ == "__main__":
-    # Pass system arguments to ray
+    # Pass system arguments to ray.
     if len(sys.argv) > 1:
         ray.init(redis_address=sys.argv[1])
-    # Setup the logging dir
+    # Setup the logging dir.
     dirname = osp.dirname(__file__)
     log_path = osp.abspath(osp.join(dirname, "../../data/ray_results"))
-    # Setup hyperparameter search starting point
+    # Setup hyperparameter search starting point.
     current_best_params = [{"gamma": 0.995, "lr_a": 1e-4, "alpha": 0.99}]
-    # Setup the parameter space for you hyperparameter search
+    # Setup the parameter space for your hyperparameter search.
     # NOTE: This script uses the hyperopt search algorithm for efficient hyperparameter
     # selection. For more information see
     # https://docs.ray.io/en/latest/tune/api_docs/suggestion.html?highlight=hyperopt.
@@ -81,7 +81,7 @@ def train_sac(config):
         points_to_evaluate=current_best_params,
     )
-    # Start the hyperparameter tuning job
+    # Start the hyperparameter tuning job.
     # NOTE: We use the ASHA job scheduler to early terminate bad trials, pause trials,
     # clone trials, and alter hyperparameters of a running trial. For more information
     # see https://docs.ray.io/en/master/tune/api_docs/schedulers.html.
@@ -102,7 +102,7 @@ def train_sac(config):
         local_dir=log_path,
     )
-    # Print the best trail
+    # Print the best trial.
     best_trial = analysis.get_best_trial(metric="mean_ep_ret", mode="min", scope="all")
     best_path = analysis.get_best_logdir(metric="mean_ep_ret", mode="min", scope="all")
     best_config = analysis.get_best_config(
diff --git a/examples/tf2/lac_ray_hyper_parameter_tuning.py b/examples/tf2/lac_ray_hyper_parameter_tuning.py
index 244a7b194..65d3a9c21 100644
--- a/examples/tf2/lac_ray_hyper_parameter_tuning.py
+++ b/examples/tf2/lac_ray_hyper_parameter_tuning.py
@@ -21,7 +21,7 @@
 import gymnasium as gym
 import numpy as np
-# Import the algorithm we want to tune
+# Import the algorithm we want to tune.
 from stable_learning_control.control.algos.tf2.lac.lac import lac
 from stable_learning_control.utils.import_utils import lazy_importer
@@ -40,10 +40,10 @@ def train_lac(config):
         config (dict): The Ray tuning configuration dictionary.
     """
-    # Unpack trainable arguments
+    # Unpack trainable arguments.
     env_name = config.pop("env_name")
-    # Run algorithm training
+    # Run algorithm training.
     lac(
         lambda: gym.make(env_name),
         **config,
     )
 if __name__ == "__main__":
-    # Pass system arguments to ray
+    # Pass system arguments to ray.
     if len(sys.argv) > 1:
         ray.init(redis_address=sys.argv[1])
-    # Setup the logging dir
+    # Setup the logging dir.
     dirname = osp.dirname(__file__)
     log_path = osp.abspath(osp.join(dirname, "../../data/ray_results"))
-    # Setup hyperparameter search starting point
+    # Setup hyperparameter search starting point.
     current_best_params = [{"gamma": 0.995, "lr_a": 1e-4, "alpha3": 0.2}]
-    # Setup the parameter space for you hyperparameter search
+    # Setup the parameter space for your hyperparameter search.
     # NOTE: This script uses the hyperopt search algorithm for efficient hyperparameter
     # selection. For more information see
     # https://docs.ray.io/en/latest/tune/api_docs/suggestion.html?highlight=hyperopt.
@@ -81,7 +81,7 @@ def train_lac(config):
         points_to_evaluate=current_best_params,
     )
-    # Start the hyperparameter tuning job
+    # Start the hyperparameter tuning job.
     # NOTE: We use the ASHA job scheduler to early terminate bad trials, pause trials,
     # clone trials, and alter hyperparameters of a running trial. For more information
     # see https://docs.ray.io/en/master/tune/api_docs/schedulers.html.
@@ -102,7 +102,7 @@ def train_lac(config):
         local_dir=log_path,
     )
-    # Print the best trail
+    # Print the best trial.
     best_trial = analysis.get_best_trial(metric="mean_ep_ret", mode="min", scope="all")
     best_path = analysis.get_best_logdir(metric="mean_ep_ret", mode="min", scope="all")
     best_config = analysis.get_best_config(
diff --git a/examples/tf2/sac_exp_grid_search.py b/examples/tf2/sac_exp_grid_search.py
index a790b761f..5e0520d85 100644
--- a/examples/tf2/sac_exp_grid_search.py
+++ b/examples/tf2/sac_exp_grid_search.py
@@ -19,10 +19,10 @@
 import tensorflow as tf
 from stable_learning_control.control.utils.run_utils import ExperimentGrid
-# Import the RL agent you want to perform the grid search for
+# Import the RL agent you want to perform the grid search for.
 from stable_learning_control.control.algos.tf2.sac.sac import sac
-# Script parametesr
+# Script parameters.
 ENV_NAME = "Oscillator-v1" # The environment on which you want to train the agent.
 if __name__ == "__main__":
@@ -41,5 +41,5 @@
     eg.add("ac_kwargs:hidden_sizes", [(32,), (64, 64)], "hid")
     eg.add("ac_kwargs:activation", [tf.nn.relu, tf.nn.relu], "")
-    # Run the grid search
+    # Run the grid search.
     eg.run(sac, num_cpu=args.cpu)
diff --git a/examples/tf2/sac_ray_hyper_parameter_tuning.py b/examples/tf2/sac_ray_hyper_parameter_tuning.py
index 2a2b5a3fc..c7ea83e2c 100644
--- a/examples/tf2/sac_ray_hyper_parameter_tuning.py
+++ b/examples/tf2/sac_ray_hyper_parameter_tuning.py
@@ -21,7 +21,7 @@
 import gymnasium as gym
 import numpy as np
-# Import the algorithm we want to tune
+# Import the algorithm we want to tune.
 from stable_learning_control.control.algos.tf2.sac.sac import sac
 from stable_learning_control.utils.import_utils import lazy_importer
@@ -40,10 +40,10 @@ def train_sac(config):
         config (dict): The Ray tuning configuration dictionary.
     """
-    # Unpack trainable arguments
+    # Unpack trainable arguments.
     env_name = config.pop("env_name")
-    # Run algorithm training
+    # Run algorithm training.
     sac(
         lambda: gym.make(env_name),
         **config,
     )
 if __name__ == "__main__":
-    # Pass system arguments to ray
+    # Pass system arguments to ray.
     if len(sys.argv) > 1:
         ray.init(redis_address=sys.argv[1])
-    # Setup the logging dir
+    # Setup the logging dir.
     dirname = osp.dirname(__file__)
     log_path = osp.abspath(osp.join(dirname, "../../data/ray_results"))
-    # Setup hyperparameter search starting point
+    # Setup hyperparameter search starting point.
     current_best_params = [{"gamma": 0.995, "lr_a": 1e-4, "alpha": 0.99}]
-    # Setup the parameter space for you hyperparameter search
+    # Setup the parameter space for your hyperparameter search.
     # NOTE: This script uses the hyperopt search algorithm for efficient hyperparameter
     # selection. For more information see
     # https://docs.ray.io/en/latest/tune/api_docs/suggestion.html?highlight=hyperopt.
@@ -81,7 +81,7 @@ def train_sac(config):
         points_to_evaluate=current_best_params,
     )
-    # Start the hyperparameter tuning job
+    # Start the hyperparameter tuning job.
     # NOTE: We use the ASHA job scheduler to early terminate bad trials, pause trials,
     # clone trials, and alter hyperparameters of a running trial. For more information
     # see https://docs.ray.io/en/master/tune/api_docs/schedulers.html.
@@ -102,7 +102,7 @@ def train_sac(config):
         local_dir=log_path,
     )
-    # Print the best trail
+    # Print the best trial.
     best_trial = analysis.get_best_trial(metric="mean_ep_ret", mode="min", scope="all")
     best_path = analysis.get_best_logdir(metric="mean_ep_ret", mode="min", scope="all")
     best_config = analysis.get_best_config(
diff --git a/sandbox/test_algorithm_seeding.py b/sandbox/test_algorithm_seeding.py
new file mode 100644
index 000000000..e08083953
--- /dev/null
+++ b/sandbox/test_algorithm_seeding.py
@@ -0,0 +1,44 @@
+"""Test the LAC seeding process."""
+
+from stable_learning_control.control.algos.pytorch.lac.lac import LAC
+from stable_learning_control.control.algos.pytorch.lac.lac import LyapunovActorCritic
+import stable_gym # noqa: F401
+import gymnasium as gym
+from gymnasium.utils import seeding
+
+if __name__ == "__main__":
+    # Create environment.
+    env = gym.make("Oscillator-v1")
+
+    # Seed the environment.
+ generator, seed = seeding.np_random(0) + env.np_random = generator + test = generator.random() + test2 = generator.random() + test3 = generator.random() + test4 = generator.random() + env.action_space.seed(seed) + env.observation_space.seed(seed) + test_act = env.action_space.sample() + test_obs = env.observation_space.sample() + + # Check the environment. + obs, info = env.reset(seed=0) + test_act = env.action_space.sample() + test_obs = env.observation_space.sample() + print(f"Initial observation: {obs}") + print(f"Initial info: {info}") + terminated, truncated = False, False + while not (terminated or truncated): + action = env.action_space.sample() + obs, reward, terminated, truncated, info = env.step(action) + print(f"Action: {action}") + print(f"Observation: {obs}") + print(f"Reward: {reward}") + print(f"Terminated: {terminated}") + print(f"Info: {info}") + truncated = info.get("TimeLimit.truncated", False) + + # Create agent and policy. + agent = LyapunovActorCritic() + policy = LAC() diff --git a/sandbox/test_gym_env.py b/sandbox/test_gym_env.py index 6c3948171..ce44cfbeb 100644 --- a/sandbox/test_gym_env.py +++ b/sandbox/test_gym_env.py @@ -17,7 +17,7 @@ if __name__ == "__main__": env = gym.make(ENV_NAME) - # Take T steps in the environment + # Take T steps in the environment. T = 1000 tau = 0.1 path = [] @@ -39,7 +39,7 @@ t1.append(i * tau) print("Finished Cartpole environment simulation.") - # Plot results + # Plot results. print("Plot results.") fig = plt.figure(figsize=(9, 6)) ax = fig.add_subplot(111) diff --git a/sandbox/test_traj_buffer.py b/sandbox/test_traj_buffer.py index 6e1578d00..f700d34e8 100644 --- a/sandbox/test_traj_buffer.py +++ b/sandbox/test_traj_buffer.py @@ -12,10 +12,10 @@ if __name__ == "__main__": - # Create dummy environment + # Create dummy environment. env = gym.make("CartPoleCost-v0") - # Dummy algorithm settings + # Dummy algorithm settings. obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] rew_dim = env.reward_range.shape[0] @@ -23,7 +23,7 @@ epochs = 10 local_steps_per_epoch = 100 - # Create Memory Buffer + # Create Memory Buffer. buffer = TrajectoryBuffer( obs_dim=obs_dim, act_dim=act_dim, @@ -33,26 +33,26 @@ incomplete=True, ) - # Create test dummy data + # Create test dummy data. o, ep_ret, ep_len = env.reset(), 0, 0 for epoch in range(epochs): for t in range(local_steps_per_epoch): - # Retrieve data from the environment + # Retrieve data from the environment. a = env.action_space.sample() next_o, r, d, _ = env.step(a) - # Store data in buffer + # Store data in buffer. buffer.store(o, a, r, next_o, d) # Update obs (critical!) o = next_o - # Finish path + # Finish path. if d: buffer.finish_path() o, ep_ret, ep_len = env.reset(), 0, 0 - # Retrieve data from buffer + # Retrieve data from buffer. buffer_data = buffer.get(flat=False) print("test") diff --git a/stable_learning_control/control/algos/__init__.py b/stable_learning_control/control/algos/__init__.py index 83c1bdf6c..85507ce58 100644 --- a/stable_learning_control/control/algos/__init__.py +++ b/stable_learning_control/control/algos/__init__.py @@ -3,7 +3,7 @@ from stable_learning_control.utils.import_utils import import_tf -# Put algorithms on namespace +# Put algorithms on namespace. 
from stable_learning_control.control.algos.pytorch.lac.lac import LAC as LAC_pytorch from stable_learning_control.control.algos.pytorch.sac.sac import SAC as SAC_pytorch diff --git a/stable_learning_control/control/algos/pytorch/common/buffers.py b/stable_learning_control/control/algos/pytorch/common/buffers.py index 952a8774b..30163a87e 100644 --- a/stable_learning_control/control/algos/pytorch/common/buffers.py +++ b/stable_learning_control/control/algos/pytorch/common/buffers.py @@ -44,7 +44,7 @@ def sample_batch(self, *args, **kwargs): super().sample_batch(*args, **kwargs), dtype=torch.float32, device=self.device, - ) # Make sure output is a torch tensor + ) # Make sure output is a torch tensor. class TrajectoryBuffer(TrajectoryBuffer): @@ -84,4 +84,4 @@ def get(self, *args, **kwargs): """ return np_to_torch( super().get(*args, **kwargs), dtype=torch.float32, device=self.device - ) # Make sure output is a torch tensor + ) # Make sure output is a torch tensor. diff --git a/stable_learning_control/control/algos/pytorch/common/get_lr_scheduler.py b/stable_learning_control/control/algos/pytorch/common/get_lr_scheduler.py index 7b86100f2..4320a65cf 100644 --- a/stable_learning_control/control/algos/pytorch/common/get_lr_scheduler.py +++ b/stable_learning_control/control/algos/pytorch/common/get_lr_scheduler.py @@ -102,4 +102,4 @@ def lr_multiplier_function(step): else: return torch.optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=lambda step: np.longdouble(1.0) - ) # Return a constant function + ) # Return a constant function. diff --git a/stable_learning_control/control/algos/pytorch/common/helpers.py b/stable_learning_control/control/algos/pytorch/common/helpers.py index e3763a45c..3418d35ed 100644 --- a/stable_learning_control/control/algos/pytorch/common/helpers.py +++ b/stable_learning_control/control/algos/pytorch/common/helpers.py @@ -53,7 +53,7 @@ def mlp(sizes, activation, output_activation=nn.Identity): Returns: torch.nn.Sequential: The multi-layered perceptron. """ # noqa: E501 - # Try to retrieve the activation function if a string was supplied + # Try to retrieve the activation function if a string was supplied. if isinstance(activation, str): activation = get_activation_function(activation, backend="torch") if isinstance(output_activation, str): diff --git a/stable_learning_control/control/algos/pytorch/lac/lac.py b/stable_learning_control/control/algos/pytorch/lac/lac.py index 3d66874e4..1c707236f 100644 --- a/stable_learning_control/control/algos/pytorch/lac/lac.py +++ b/stable_learning_control/control/algos/pytorch/lac/lac.py @@ -60,15 +60,15 @@ setup_logger_kwargs, ) -# Import ray tuner if installed +# Import ray tuner if installed. tune = lazy_importer(module_name="ray.tune") -# Script settings +# Script settings. SCALE_LAMBDA_MIN_MAX = ( 0.0, 1.0, -) # Range of lambda lagrance multiplier -SCALE_ALPHA_MIN_MAX = (0.0, np.inf) # Range of alpha lagrance multiplier +) # Range of lambda lagrance multiplier. +SCALE_ALPHA_MIN_MAX = (0.0, np.inf) # Range of alpha lagrance multiplier. STD_OUT_LOG_VARS_DEFAULT = [ "Epoch", "TotalEnvInteracts", @@ -210,7 +210,7 @@ def __init__( # noqa: C901 k: v for k, v in locals().items() if k not in ["self", "__class__", "env"] } - # Validate gymnasium env + # Validate gymnasium env. # NOTE: The current implementation only works with continuous spaces. 
if not is_gym_env(env): raise ValueError("Env must be a valid gymnasium environment.") @@ -246,7 +246,7 @@ def __init__( # noqa: C901 type="info", ) - # Store algorithm parameters + # Store algorithm parameters. self._act_dim = env.action_space.shape self._obs_dim = env.observation_space.shape self._device = retrieve_device(device) @@ -265,7 +265,7 @@ def __init__( # noqa: C901 else: self._target_entropy = target_entropy - # Create variables for the Lagrance multipliers + # Create variables for the Lagrance multipliers. # NOTE: Clip at 1e-37 to prevent log_alpha/log_lambda from becoming -np.inf self.log_alpha = nn.Parameter( torch.tensor(np.log(1e-37 if alpha < 1e-37 else alpha), requires_grad=True) @@ -291,7 +291,7 @@ def __init__( # noqa: C901 for p in self.ac_targ.parameters(): p.requires_grad = False - # Create optimizers + # Create optimizers. # NOTE: We here optimize for log_alpha and log_labda instead of alpha and labda # because it is more numerically stable (see: # https://github.com/rail-berkeley/softlearning/issues/136) @@ -367,9 +367,9 @@ def update(self, data): # noqa: C901 o_ ) # NOTE: Target actions come from *current* *target* policy l_pi_targ = self.ac_targ.L(o_, pi_targ_) - l_backup = r + self._gamma * (1 - d) * l_pi_targ # The Lyapunov candidate + l_backup = r + self._gamma * (1 - d) * l_pi_targ # The Lyapunov candidate. - # Get current Lyapunov value + # Get current Lyapunov value. l1 = self.ac.L(o, a) # Calculate Lyapunov *CRITIC* error @@ -407,11 +407,11 @@ def update(self, data): # noqa: C901 "if you need this." ) - # Get target lyapunov value + # Get target lyapunov value. pi_, _ = self.ac.pi(o_) # NOTE: Target actions come from *current* policy lya_l_ = self.ac.L(o_, pi_) - # Compute Lyapunov Actor error + # Compute Lyapunov Actor error. l_delta = torch.mean(lya_l_ - l1.detach() + self._alpha3 * r) # See Han eq. 11 # Calculate entropy-regularized policy loss @@ -437,7 +437,7 @@ def update(self, data): # noqa: C901 if self._adaptive_temperature: self._log_alpha_optimizer.zero_grad() - # Calculate alpha loss + # Calculate alpha loss. alpha_loss = -( self.alpha * (logp_pi.detach() + self.target_entropy) ).mean() # See Haarnoja eq. 17 @@ -454,7 +454,7 @@ def update(self, data): # noqa: C901 ################################################ self._log_labda_optimizer.zero_grad() - # Calculate labda loss + # Calculate labda loss. # NOTE: Log_labda was used in the lambda_loss function because using lambda # caused the gradients to vanish. This is caused since we restrict lambda # within a 0-1.0 range using the clamp function (see #38). Using log_lambda @@ -496,7 +496,7 @@ def save(self, path): except Exception as e: raise Exception("LAC model could not be saved.") from e - # Save additional information + # Save additional information. save_info = { "alg_name": self.__class__.__name__, "setup_kwargs": self._setup_kwargs, @@ -641,7 +641,7 @@ def state_dict(self): state_dict = super().state_dict() state_dict[ "alg_name" - ] = self.__class__.__name__ # Save algorithm name state dict + ] = self.__class__.__name__ # Save algorithm name state dict. return state_dict def bound_lr( @@ -958,7 +958,7 @@ def lac( # noqa: C901 env = env_fn() - # Validate gymnasium env + # Validate gymnasium env. # NOTE: The current implementation only works with continuous spaces. 
if not is_gym_env(env): raise ValueError("Env must be a valid gymnasium environment.") @@ -1004,9 +1004,9 @@ def lac( # noqa: C901 hyper_paramet_dict = { k: v for k, v in locals().items() if k not in ["logger"] } # Retrieve hyperparameters (Ignore logger object) - logger.save_config(hyper_paramet_dict) # Write hyperparameters to logger + logger.save_config(hyper_paramet_dict) # Write hyperparameters to logger. - # Retrieve max episode length + # Retrieve max episode length. if max_ep_len is None: max_ep_len = env.env._max_episode_steps else: @@ -1027,7 +1027,7 @@ def lac( # noqa: C901 # Get default actor critic if no 'actor_critic' was supplied actor_critic = LyapunovActorCritic if actor_critic is None else actor_critic - # Set random seed for reproducible results + # Set random seed for reproducible results. if seed is not None: os.environ["PYTHONHASHSEED"] = str(seed) torch.manual_seed(seed) @@ -1051,7 +1051,7 @@ def lac( # noqa: C901 device, ) - # Restore policy if supplied + # Restore policy if supplied. if start_policy is not None: logger.log(f"Restoring model from '{start_policy}'.", type="info") try: @@ -1074,13 +1074,13 @@ def lac( # noqa: C901 device=policy.device, ) - # Count variables and print network structure + # Count variables and print network structure. var_counts = tuple(count_vars(module) for module in [policy.ac.pi, policy.ac.L]) logger.log("Number of parameters: \t pi: %d, \t L: %d\n" % var_counts, type="info") logger.log("Network structure:\n", type="info") logger.log(policy.ac, end="\n\n") - # Create learning rate schedulers + # Create learning rate schedulers. opt_schedulers = [] lr_decay_ref_var = total_steps if lr_decay_ref.lower() == "steps" else epochs pi_opt_scheduler = get_lr_scheduler( @@ -1106,7 +1106,7 @@ def lac( # noqa: C901 logger.setup_pytorch_saver(policy) - # Setup diagnostics tb_write dict and store initial learning rates + # Setup diagnostics tb_write dict and store initial learning rates. diag_tb_log_list = [ "ErrorL", "LossPi", @@ -1155,7 +1155,7 @@ def lac( # noqa: C901 else: a = env.action_space.sample() - # Take step in the env + # Take step in the env. o_, r, d, truncated, _ = env.step(a) ep_ret += r ep_len += 1 @@ -1165,28 +1165,28 @@ def lac( # noqa: C901 # Make sure to update most recent observation! o = o_ - # End of trajectory handling + # End of trajectory handling. if d or truncated: logger.store(EpRet=ep_ret, EpLen=ep_len) o, _ = env.reset() ep_ret, ep_len = 0, 0 - # Update handling + # Update handling. if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0: - # Step based learning rate decay + # Step based learning rate decay. if lr_decay_ref.lower() == "step": for scheduler in opt_schedulers: scheduler.step() policy.bound_lr( lr_a_final, lr_c_final, lr_a_final, lr_a_final - ) # Make sure lr is bounded above the final lr + ) # Make sure lr is bounded above the final lr. for _ in range(steps_per_update): batch = replay_buffer.sample_batch(batch_size) update_diagnostics = policy.update(data=batch) - logger.store(**update_diagnostics) # Log diagnostics + logger.store(**update_diagnostics) # Log diagnostics. - # SGD batch tb logging + # SGD batch tb logging. if use_tensorboard and not tb_low_log_freq: logger.log_to_tb(keys=diag_tb_log_list, global_step=t) @@ -1194,11 +1194,11 @@ def lac( # noqa: C901 if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch - # Save model + # Save model. 
if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({"env": env}, itr=epoch) - # Test the performance of the deterministic version of the agent + # Test the performance of the deterministic version of the agent. if num_test_episodes != 0: eps_ret, eps_len = test_agent( policy, test_env, num_test_episodes, max_ep_len=max_ep_len @@ -1209,15 +1209,15 @@ def lac( # noqa: C901 extend=True, ) - # Epoch based learning rate decay + # Epoch based learning rate decay. if lr_decay_ref.lower() != "step": for scheduler in opt_schedulers: scheduler.step() policy.bound_lr( lr_a_final, lr_c_final, lr_a_final, lr_a_final - ) # Make sure lr is bounded above the final lr + ) # Make sure lr is bounded above the final lr. - # Log performance measure to ray tuning + # Log performance measure to ray tuning. # NOTE: Only executed when the ray tuner invokes the script if hasattr(tune, "session") and tune.session._session is not None: mean_ep_ret = logger.get_stats("EpRet") @@ -1226,7 +1226,7 @@ def lac( # noqa: C901 mean_ep_ret=mean_ep_ret[0], epoch=epoch, mean_ep_len=mean_ep_len[0] ) - # Log info about epoch + # Log info about epoch. logger.log_tabular("Epoch", epoch) logger.log_tabular("TotalEnvInteracts", t) logger.log_tabular( @@ -1546,7 +1546,7 @@ def lac( # noqa: C901 ), ) - # Parse logger related arguments + # Parse logger related arguments. parser.add_argument( "--exp_name", type=str, @@ -1605,7 +1605,7 @@ def lac( # noqa: C901 ) args = parser.parse_args() - # Setup actor critic arguments + # Setup actor critic arguments. output_activation = {} output_activation["actor"] = safer_eval(args.act_out_a, backend="torch") ac_kwargs = dict( @@ -1620,7 +1620,7 @@ def lac( # noqa: C901 output_activation=output_activation, ) - # Setup output dir for logger and return output kwargs + # Setup output dir for logger and return output kwargs. logger_kwargs = setup_logger_kwargs( args.exp_name, args.seed, diff --git a/stable_learning_control/control/algos/pytorch/policies/actors/squashed_gaussian_actor.py b/stable_learning_control/control/algos/pytorch/policies/actors/squashed_gaussian_actor.py index 234de7e24..a5d8b4cf7 100644 --- a/stable_learning_control/control/algos/pytorch/policies/actors/squashed_gaussian_actor.py +++ b/stable_learning_control/control/algos/pytorch/policies/actors/squashed_gaussian_actor.py @@ -89,7 +89,7 @@ def forward(self, obs, deterministic=False, with_logprob=True): - pi_action (:obj:`torch.Tensor`): The actions given by the policy. - logp_pi (:obj:`torch.Tensor`): The log probabilities of each of these actions. """ # noqa: E501 - # Make sure the observations are on the right device + # Make sure the observations are on the right device. if obs.device != self.net[0].weight.device: if not self.__device_warning_logged: device_warn_msg = ( @@ -109,7 +109,7 @@ def forward(self, obs, deterministic=False, with_logprob=True): self.__device_warning_logged = True obs = obs.to(self.net[0].weight.device) - # Calculate mean action and standard deviation + # Calculate mean action and standard deviation. net_out = self.net(obs) mu = self.mu_layer(net_out) log_std = self.log_std_layer(net_out) @@ -124,7 +124,7 @@ def forward(self, obs, deterministic=False, with_logprob=True): else: pi_action = ( pi_distribution.rsample() - ) # Sample while using the parameterization trick + ) # Sample while using the parameterization trick. # Compute logprob from Gaussian, and then apply correction for Tanh squashing. 
if with_logprob: @@ -141,10 +141,10 @@ def forward(self, obs, deterministic=False, with_logprob=True): else: logp_pi = None - # Calculate scaled action and return the action and its log probability + # Calculate scaled action and return the action and its log probability. pi_action = torch.tanh(pi_action) # Squash gaussian to be between -1 and 1 - # Clamp the actions such that they are in range of the environment + # Clamp the actions such that they are in range of the environment. if self.act_limits is not None: pi_action = clamp( pi_action, diff --git a/stable_learning_control/control/algos/pytorch/policies/critics/L_critic.py b/stable_learning_control/control/algos/pytorch/policies/critics/L_critic.py index b6cf0357c..364798c55 100644 --- a/stable_learning_control/control/algos/pytorch/policies/critics/L_critic.py +++ b/stable_learning_control/control/algos/pytorch/policies/critics/L_critic.py @@ -52,7 +52,7 @@ def forward(self, obs, act): The tensor containing the lyapunov values of the input observations and actions. """ - # Make sure the observations and actions are on the right device + # Make sure the observations and actions are on the right device. self._obs_same_device = obs.device != self.L[0].weight.device self._act_same_device = act.device != self.L[0].weight.device if self._obs_same_device or self._act_same_device: diff --git a/stable_learning_control/control/algos/pytorch/policies/critics/Q_critic.py b/stable_learning_control/control/algos/pytorch/policies/critics/Q_critic.py index 198ac638c..56c0c66f6 100644 --- a/stable_learning_control/control/algos/pytorch/policies/critics/Q_critic.py +++ b/stable_learning_control/control/algos/pytorch/policies/critics/Q_critic.py @@ -60,7 +60,7 @@ def forward(self, obs, act): The tensor containing the Q values of the input observations and actions. """ - # Make sure the observations and actions are on the right device + # Make sure the observations and actions are on the right device. self._obs_same_device = obs.device != self.Q[0].weight.device self._act_same_device = act.device != self.Q[0].weight.device if self._obs_same_device or self._act_same_device: diff --git a/stable_learning_control/control/algos/pytorch/sac/sac.py b/stable_learning_control/control/algos/pytorch/sac/sac.py index 916701420..f283c0777 100644 --- a/stable_learning_control/control/algos/pytorch/sac/sac.py +++ b/stable_learning_control/control/algos/pytorch/sac/sac.py @@ -61,15 +61,15 @@ setup_logger_kwargs, ) -# Import ray tuner if installed +# Import ray tuner if installed. tune = lazy_importer(module_name="ray.tune") -# Script settings +# Script settings. SCALE_LAMBDA_MIN_MAX = ( 0.0, 1.0, -) # Range of lambda lagrance multiplier -SCALE_ALPHA_MIN_MAX = (0.0, np.inf) # Range of alpha lagrance multiplier +) # Range of lambda lagrance multiplier. +SCALE_ALPHA_MIN_MAX = (0.0, np.inf) # Range of alpha lagrance multiplier. STD_OUT_LOG_VARS_DEFAULT = [ "Epoch", "TotalEnvInteracts", @@ -203,7 +203,7 @@ def __init__( # noqa: C901 k: v for k, v in locals().items() if k not in ["self", "__class__", "env"] } - # Validate gymnasium env + # Validate gymnasium env. # NOTE: The current implementation only works with continuous spaces. if not is_gym_env(env): raise ValueError("Env must be a valid gymnasium environment.") @@ -239,7 +239,7 @@ def __init__( # noqa: C901 type="info", ) - # Store algorithm parameters + # Store algorithm parameters. 
self._act_dim = env.action_space.shape self._obs_dim = env.observation_space.shape self._device = retrieve_device(device) @@ -256,7 +256,7 @@ def __init__( # noqa: C901 else: self._target_entropy = target_entropy - # Create variables for the Lagrance multipliers + # Create variables for the Lagrance multipliers. # NOTE: Clip at 1e-37 to prevent log_alpha/log_lambda from becoming -np.inf self.log_alpha = nn.Parameter( torch.tensor(np.log(1e-37 if alpha < 1e-37 else alpha), requires_grad=True) @@ -279,7 +279,7 @@ def __init__( # noqa: C901 for p in self.ac_targ.parameters(): p.requires_grad = False - # Create optimizers + # Create optimizers. # NOTE: We here optimize for log_alpha instead of alpha because it is more # numerically stable (see: # https://github.com/rail-berkeley/softlearning/issues/136) @@ -350,7 +350,7 @@ def update(self, data): # noqa: C901 o_ ) # NOTE: Target actions coming from *current* policy - # Get target Q values based on optimization type + # Get target Q values based on optimization type. q1_pi_targ = self.ac_targ.Q1(o_, pi_) q2_pi_targ = self.ac_targ.Q2(o_, pi_) if self._opt_type.lower() == "minimize": @@ -361,10 +361,10 @@ def update(self, data): # noqa: C901 else: q_pi_targ = torch.min( q1_pi_targ, q2_pi_targ - ) # Use min clipping to prevent overestimation bias + ) # Use min clipping to prevent overestimation bias. q_backup = r + self._gamma * (1 - d) * (q_pi_targ - self.alpha * logp_pi_) - # Retrieve the current Q values + # Retrieve the current Q values. q1 = self.ac.Q1(o, a) q2 = self.ac.Q2(o, a) @@ -393,7 +393,7 @@ def update(self, data): # noqa: C901 # Retrieve log probabilities of batch observations based on *current* policy pi, logp_pi = self.ac.pi(o) - # Retrieve current Q values + # Retrieve current Q values. # NOTE: Actions come from *current* policy q1_pi = self.ac.Q1(o, pi) q2_pi = self.ac.Q2(o, pi) @@ -428,7 +428,7 @@ def update(self, data): # noqa: C901 if self._adaptive_temperature: self._log_alpha_optimizer.zero_grad() - # Calculate alpha loss + # Calculate alpha loss. alpha_loss = -( self.alpha * (logp_pi.detach() + self.target_entropy) ).mean() # See Haarnoja eq. 17 @@ -467,7 +467,7 @@ def save(self, path): except Exception as e: raise Exception("SAC model could not be saved.") from e - # Save additional information + # Save additional information. save_info = { "alg_name": self.__class__.__name__, "setup_kwargs": self._setup_kwargs, @@ -612,7 +612,7 @@ def state_dict(self): state_dict = super().state_dict() state_dict[ "alg_name" - ] = self.__class__.__name__ # Save algorithm name state dict + ] = self.__class__.__name__ # Save algorithm name state dict. return state_dict def bound_lr(self, lr_a_final=None, lr_c_final=None, lr_alpha_final=None): @@ -895,7 +895,7 @@ def sac( # noqa: C901 env = env_fn() - # Validate gymnasium env + # Validate gymnasium env. # NOTE: The current implementation only works with continuous spaces. if not is_gym_env(env): raise ValueError("Env must be a valid gymnasium environment.") @@ -941,9 +941,9 @@ def sac( # noqa: C901 hyper_paramet_dict = { k: v for k, v in locals().items() if k not in ["logger"] } # Retrieve hyperparameters (Ignore logger object) - logger.save_config(hyper_paramet_dict) # Write hyperparameters to logger + logger.save_config(hyper_paramet_dict) # Write hyperparameters to logger. - # Retrieve max episode length + # Retrieve max episode length. 
if max_ep_len is None: max_ep_len = env.env._max_episode_steps else: @@ -964,7 +964,7 @@ def sac( # noqa: C901 # Get default actor critic if no 'actor_critic' was supplied actor_critic = SoftActorCritic if actor_critic is None else actor_critic - # Set random seed for reproducible results + # Set random seed for reproducible results. if seed is not None: os.environ["PYTHONHASHSEED"] = str(seed) torch.manual_seed(seed) @@ -986,7 +986,7 @@ def sac( # noqa: C901 device, ) - # Restore policy if supplied + # Restore policy if supplied. if start_policy is not None: logger.log(f"Restoring model from '{start_policy}'.", type="info") try: @@ -1009,7 +1009,7 @@ def sac( # noqa: C901 device=policy.device, ) - # Count variables and print network structure + # Count variables and print network structure. var_counts = tuple( count_vars(module) for module in [policy.ac.pi, policy.ac.Q1, policy.ac.Q2] ) @@ -1020,7 +1020,7 @@ def sac( # noqa: C901 logger.log("Network structure:\n", type="info") logger.log(policy.ac, end="\n\n") - # Create learning rate schedulers + # Create learning rate schedulers. opt_schedulers = [] lr_decay_ref_var = total_steps if lr_decay_ref.lower() == "steps" else epochs pi_opt_scheduler = get_lr_scheduler( @@ -1038,7 +1038,7 @@ def sac( # noqa: C901 logger.setup_pytorch_saver(policy) - # Setup diagnostics tb_write dict and store initial learning rates + # Setup diagnostics tb_write dict and store initial learning rates. diag_tb_log_list = ["LossQ", "LossPi", "Alpha", "LossAlpha", "Entropy"] if use_tensorboard: logger.log_to_tb( @@ -1073,7 +1073,7 @@ def sac( # noqa: C901 else: a = env.action_space.sample() - # Take step in the env + # Take step in the env. o_, r, d, truncated, _ = env.step(a) ep_ret += r ep_len += 1 @@ -1083,28 +1083,28 @@ def sac( # noqa: C901 # Make sure to update most recent observation! o = o_ - # End of trajectory handling + # End of trajectory handling. if d or truncated: logger.store(EpRet=ep_ret, EpLen=ep_len) o, _ = env.reset() ep_ret, ep_len = 0, 0 - # Update handling + # Update handling. if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0: - # Step based learning rate decay + # Step based learning rate decay. if lr_decay_ref.lower() == "step": for scheduler in opt_schedulers: scheduler.step() policy.bound_lr( lr_a_final, lr_c_final, lr_a_final - ) # Make sure lr is bounded above the final lr + ) # Make sure lr is bounded above the final lr. for _ in range(steps_per_update): batch = replay_buffer.sample_batch(batch_size) update_diagnostics = policy.update(data=batch) - logger.store(**update_diagnostics) # Log diagnostics + logger.store(**update_diagnostics) # Log diagnostics. - # SGD batch tb logging + # SGD batch tb logging. if use_tensorboard and not tb_low_log_freq: logger.log_to_tb(keys=diag_tb_log_list, global_step=t) @@ -1112,11 +1112,11 @@ def sac( # noqa: C901 if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch - # Save model + # Save model. if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({"env": env}, itr=epoch) - # Test the performance of the deterministic version of the agent + # Test the performance of the deterministic version of the agent. if num_test_episodes != 0: eps_ret, eps_len = test_agent( policy, test_env, num_test_episodes, max_ep_len=max_ep_len @@ -1127,15 +1127,15 @@ def sac( # noqa: C901 extend=True, ) - # Epoch based learning rate decay + # Epoch based learning rate decay. 
if lr_decay_ref.lower() != "step": for scheduler in opt_schedulers: scheduler.step() policy.bound_lr( lr_a_final, lr_c_final, lr_a_final - ) # Make sure lr is bounded above the final lr + ) # Make sure lr is bounded above the final lr. - # Log performance measure to ray tuning + # Log performance measure to ray tuning. # NOTE: Only executed when the ray tuner invokes the script if hasattr(tune, "session") and tune.session._session is not None: mean_ep_ret = logger.get_stats("EpRet") @@ -1144,7 +1144,7 @@ def sac( # noqa: C901 mean_ep_ret=mean_ep_ret[0], epoch=epoch, mean_ep_len=mean_ep_len[0] ) - # Log info about epoch + # Log info about epoch. logger.log_tabular("Epoch", epoch) logger.log_tabular("TotalEnvInteracts", t) logger.log_tabular( @@ -1443,7 +1443,7 @@ def sac( # noqa: C901 ), ) - # Parse logger related arguments + # Parse logger related arguments. parser.add_argument( "--exp_name", type=str, @@ -1502,7 +1502,7 @@ def sac( # noqa: C901 ) args = parser.parse_args() - # Setup actor critic arguments + # Setup actor critic arguments. output_activation = {} output_activation["actor"] = safer_eval(args.act_out_a, backend="torch") output_activation["critic"] = safer_eval(args.act_out_c, backend="torch") @@ -1518,7 +1518,7 @@ def sac( # noqa: C901 output_activation=output_activation, ) - # Setup output dir for logger and return output kwargs + # Setup output dir for logger and return output kwargs. logger_kwargs = setup_logger_kwargs( args.exp_name, args.seed, diff --git a/stable_learning_control/control/algos/tf2/__init__.py b/stable_learning_control/control/algos/tf2/__init__.py index bad9aabe1..ab985d609 100644 --- a/stable_learning_control/control/algos/tf2/__init__.py +++ b/stable_learning_control/control/algos/tf2/__init__.py @@ -1,6 +1,6 @@ """Contains the Tensorflow 2.x implementations of the RL/IL algorithms. """ -# Put algorithms on namespace for easy loading in the test_policy utility +# Put algorithms on namespace for easy loading in the test_policy utility. from stable_learning_control.control.algos.tf2.lac.lac import LAC from stable_learning_control.control.algos.tf2.sac.sac import SAC diff --git a/stable_learning_control/control/algos/tf2/common/get_lr_scheduler.py b/stable_learning_control/control/algos/tf2/common/get_lr_scheduler.py index 476906bd3..d6945c764 100644 --- a/stable_learning_control/control/algos/tf2/common/get_lr_scheduler.py +++ b/stable_learning_control/control/algos/tf2/common/get_lr_scheduler.py @@ -48,4 +48,4 @@ def get_lr_scheduler(decaying_lr_type, lr_start, lr_final, steps): return lr_scheduler else: - return lambda step: lr_start # Return a constant learning rate + return lambda step: lr_start # Return a constant learning rate. diff --git a/stable_learning_control/control/algos/tf2/common/helpers.py b/stable_learning_control/control/algos/tf2/common/helpers.py index 00c029c0b..6162b19ed 100644 --- a/stable_learning_control/control/algos/tf2/common/helpers.py +++ b/stable_learning_control/control/algos/tf2/common/helpers.py @@ -20,7 +20,7 @@ def set_device(device_type="cpu"): str: The type of device that is used. """ if device_type.lower() == "cpu": - tf.config.set_visible_devices([], "GPU") # Force disable GPU + tf.config.set_visible_devices([], "GPU") # Force disable GPU. 
log_to_std_out(f"Tensorflow is using the {device_type.upper()}.", type="info") return device_type.lower() diff --git a/stable_learning_control/control/algos/tf2/lac/lac.py b/stable_learning_control/control/algos/tf2/lac/lac.py index fe84dde69..21634243c 100644 --- a/stable_learning_control/control/algos/tf2/lac/lac.py +++ b/stable_learning_control/control/algos/tf2/lac/lac.py @@ -58,15 +58,15 @@ nn = import_tf(module_name="tensorflow.nn") Adam = import_tf(module_name="tensorflow.keras.optimizers", class_name="Adam") -# Import ray tuner if installed +# Import ray tuner if installed. tune = lazy_importer(module_name="ray.tune") -# Script settings +# Script settings. SCALE_LAMBDA_MIN_MAX = ( 0.0, 1.0, -) # Range of lambda lagrance multiplier -SCALE_ALPHA_MIN_MAX = (0.0, np.inf) # Range of alpha lagrance multiplier +) # Range of lambda lagrance multiplier. +SCALE_ALPHA_MIN_MAX = (0.0, np.inf) # Range of alpha lagrance multiplier. STD_OUT_LOG_VARS_DEFAULT = [ "Epoch", "TotalEnvInteracts", @@ -212,7 +212,7 @@ def __init__( # noqa: C901 } self._was_build = False - # Validate gymnasium env + # Validate gymnasium env. # NOTE: The current implementation only works with continuous spaces. if not is_gym_env(env): raise ValueError("Env must be a valid gymnasium environment.") @@ -248,7 +248,7 @@ def __init__( # noqa: C901 type="info", ) - # Store algorithm parameters + # Store algorithm parameters. self._act_dim = env.action_space.shape self._obs_dim = env.observation_space.shape self._device = set_device(device) @@ -267,7 +267,7 @@ def __init__( # noqa: C901 else: self._target_entropy = target_entropy - # Create variables for the Lagrance multipliers + # Create variables for the Lagrance multipliers. # NOTE: Clip at 1e-37 to prevent log_alpha/log_lambda from becoming -np.inf self.log_alpha = tf.Variable( tf.math.log(1e-37 if alpha < 1e-37 else alpha), name="log_alpha" @@ -290,7 +290,7 @@ def __init__( # noqa: C901 self._init_targets() - # Create optimizers + # Create optimizers. # NOTE: We here optimize for log_alpha and log_labda instead of alpha and labda # because it is more numerically stable (see: # https://github.com/rail-berkeley/softlearning/issues/136) @@ -370,11 +370,11 @@ def update(self, data): o_ ) # NOTE: Target actions come from *current* *target* policy l_pi_targ = self.ac_targ.L([o_, pi_targ_]) - l_backup = r + self._gamma * (1 - d) * l_pi_targ # The Lyapunov candidate + l_backup = r + self._gamma * (1 - d) * l_pi_targ # The Lyapunov candidate. - # Compute Lyapunov Critic error gradients + # Compute Lyapunov Critic error gradients. with tf.GradientTape() as l_tape: - # Get current Lyapunov value + # Get current Lyapunov value. l1 = self.ac.L([o, a]) # Calculate Lyapunov *CRITIC* error @@ -391,7 +391,7 @@ def update(self, data): ################################################ # Optimize Gaussian actor ###################### ################################################ - # Compute actor loss gradients + # Compute actor loss gradients. with tf.GradientTape() as a_tape: # Retrieve log probabilities of batch observations based on *current* policy _, logp_pi = self.ac.pi(o) @@ -404,11 +404,11 @@ def update(self, data): "if you need this." ) - # Get target lyapunov value + # Get target lyapunov value. pi_, _ = self.ac.pi(o_) # NOTE: Target actions come from *current* policy lya_l_ = self.ac.L([o_, pi_]) - # Compute Lyapunov Actor error + # Compute Lyapunov Actor error. l_delta = tf.reduce_mean( lya_l_ - tf.stop_gradient(l1) + self._alpha3 * r ) # See Han eq. 
11 @@ -429,9 +429,9 @@ def update(self, data): # Optimize alpha (Entropy temperature) ######### ################################################ if self._adaptive_temperature: - # Compute alpha loss gradients + # Compute alpha loss gradients. with tf.GradientTape() as alpha_tape: - # Calculate alpha loss + # Calculate alpha loss. alpha_loss = -tf.reduce_mean( self.alpha * tf.stop_gradient(logp_pi + self.target_entropy) ) # See Haarnoja eq. 17 @@ -447,9 +447,9 @@ def update(self, data): # Optimize labda (Lyapunov temperature) ######## ################################################ - # Compute labda loss gradients + # Compute labda loss gradients. with tf.GradientTape() as lambda_tape: - # Calculate labda loss + # Calculate labda loss. # NOTE: Log_labda was used in the lambda_loss function because using # lambda caused the gradients to vanish. This is caused since we # restrict lambda within a 0-1.0 range using the clamp function @@ -496,7 +496,7 @@ def save(self, path, checkpoint_name="checkpoint"): except Exception as e: raise Exception("LAC model could not be saved.") from e - # Save additional information + # Save additional information. save_info = { "alg_name": self.__class__.__name__, "setup_kwargs": self._setup_kwargs, @@ -528,7 +528,7 @@ def restore(self, path, restore_lagrance_multipliers=False): "path and try again." ) - # Store initial values in order to ignore them when loading the weights + # Store initial values in order to ignore them when loading the weights. lr_a = self._lr_a.value() lr_alpha = self._lr_alpha.value() lr_lag = self._lr_lag.value() @@ -576,7 +576,7 @@ def export(self, path): obs_dummy = tf.random.uniform( combine_shapes(1, self._obs_dim), dtype=tf.float32 ) - self.ac.pi.get_action(obs_dummy) # Make sure the full graph was traced + self.ac.pi.get_action(obs_dummy) # Make sure the full graph was traced. self.ac.pi.save(osp.join(path, "tf2_save")) def build(self): @@ -912,7 +912,7 @@ def lac( # noqa: C901 env = env_fn() - # Validate gymnasium env + # Validate gymnasium env. # NOTE: The current implementation only works with continuous spaces. if not is_gym_env(env): raise ValueError("Env must be a valid gymnasium environment.") @@ -959,9 +959,9 @@ def lac( # noqa: C901 hyper_paramet_dict = { k: v for k, v in locals().items() if k not in ["logger"] } # Retrieve hyperparameters (Ignore logger object) - logger.save_config(hyper_paramet_dict) # Write hyperparameters to logger + logger.save_config(hyper_paramet_dict) # Write hyperparameters to logger. - # Retrieve max episode length + # Retrieve max episode length. if max_ep_len is None: max_ep_len = env.env._max_episode_steps else: @@ -982,7 +982,7 @@ def lac( # noqa: C901 # Get default actor critic if no 'actor_critic' was supplied actor_critic = LyapunovActorCritic if actor_critic is None else actor_critic - # Set random seed for reproducible results + # Set random seed for reproducible results. if seed is not None: os.environ["PYTHONHASHSEED"] = str(seed) os.environ["TF_CUDNN_DETERMINISTIC"] = "1" # new flag present in tf 2.0+ @@ -1007,13 +1007,13 @@ def lac( # noqa: C901 device, ) - # Create learning rate schedulers + # Create learning rate schedulers. # NOTE: Alpha and labda currently use the same scheduler as the actor. 
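# NOTE: Illustrative sketch, not part of this diff: a hedged sketch of what a
# get_lr_scheduler-style factory returns, i.e. a plain callable mapping a step (or
# epoch) to a learning rate, with a constant schedule as the fallback. The names
# and the linear rule are made up; the package's helper may differ in detail.
def make_lr_scheduler(decay_type, lr_start, lr_final, steps):
    if decay_type == "linear":
        return lambda step: lr_start + min(step / steps, 1.0) * (lr_final - lr_start)
    return lambda step: lr_start  # Constant learning rate.

lr_a_scheduler = make_lr_scheduler("linear", 1e-4, 1e-5, 100)
print(lr_a_scheduler(50))  # 5.5e-05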
lr_decay_ref_var = total_steps if lr_decay_ref.lower() == "steps" else epochs lr_a_scheduler = get_lr_scheduler(lr_decay_type, lr_a, lr_a_final, lr_decay_ref_var) lr_c_scheduler = get_lr_scheduler(lr_decay_type, lr_c, lr_c_final, lr_decay_ref_var) - # Restore policy if supplied + # Restore policy if supplied. if start_policy is not None: logger.log(f"Restoring model from '{start_policy}'.", type="info") try: @@ -1035,7 +1035,7 @@ def lac( # noqa: C901 size=replay_size, ) - # Count variables and print network structure + # Count variables and print network structure. var_counts = tuple(count_vars(module) for module in [policy.ac.pi, policy.ac.L]) logger.log("Number of parameters: \t pi: %d, \t L: %d\n" % var_counts, type="info") logger.log("Network structure:\n", type="info") @@ -1043,7 +1043,7 @@ def lac( # noqa: C901 logger.setup_tf_saver(policy) - # Setup diagnostics tb_write dict and store initial learning rates + # Setup diagnostics tb_write dict and store initial learning rates. diag_tb_log_list = [ "ErrorL", "LossPi", @@ -1092,7 +1092,7 @@ def lac( # noqa: C901 else: a = env.action_space.sample() - # Take step in the env + # Take step in the env. o_, r, d, truncated, _ = env.step(a) ep_ret += r ep_len += 1 @@ -1102,22 +1102,22 @@ def lac( # noqa: C901 # Make sure to update most recent observation! o = o_ - # End of trajectory handling + # End of trajectory handling. if d or truncated: logger.store(EpRet=ep_ret, EpLen=ep_len) o, _ = env.reset() ep_ret, ep_len = 0, 0 - # Update handling + # Update handling. if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0: - # Step based learning rate decay + # Step based learning rate decay. if lr_decay_ref.lower() == "step": lr_a_now = max( lr_a_scheduler(t + 1), lr_a_final - ) # Make sure lr is bounded above final lr + ) # Make sure lr is bounded above final lr. lr_c_now = max( lr_c_scheduler(t + 1), lr_c_final - ) # Make sure lr is bounded above final lr + ) # Make sure lr is bounded above final lr. policy.set_learning_rates( lr_a=lr_a_now, lr_c=lr_c_now, lr_alpha=lr_a_now, lr_labda=lr_a_now ) @@ -1125,8 +1125,8 @@ def lac( # noqa: C901 for _ in range(steps_per_update): batch = replay_buffer.sample_batch(batch_size) update_diagnostics = policy.update(data=batch) - logger.store(**update_diagnostics) # Log diagnostics - # SGD batch tb logging + logger.store(**update_diagnostics) # Log diagnostics. + # SGD batch tb logging. if use_tensorboard and not tb_low_log_freq: logger.log_to_tb(keys=diag_tb_log_list, global_step=t) @@ -1134,11 +1134,11 @@ def lac( # noqa: C901 if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch - # Save model + # Save model. if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({"env": env}, itr=epoch) - # Test the performance of the deterministic version of the agent + # Test the performance of the deterministic version of the agent. if num_test_episodes != 0: eps_ret, eps_len = test_agent( policy, test_env, num_test_episodes, max_ep_len=max_ep_len @@ -1149,19 +1149,19 @@ def lac( # noqa: C901 extend=True, ) - # Epoch based learning rate decay + # Epoch based learning rate decay. if lr_decay_ref.lower() != "step": lr_a_now = max( lr_a_scheduler(epoch), lr_a_final - ) # Make sure lr is bounded above final + ) # Make sure lr is bounded above final. lr_c_now = max( lr_c_scheduler(epoch), lr_c_final - ) # Make sure lr is bounded above final + ) # Make sure lr is bounded above final. 
policy.set_learning_rates( lr_a=lr_a_now, lr_c=lr_c_now, lr_alpha=lr_a_now, lr_labda=lr_a_now ) - # Log performance measure to ray tuning + # Log performance measure to ray tuning. # NOTE: Only executed when the ray tuner invokes the script if hasattr(tune, "session") and tune.session._session is not None: mean_ep_ret = logger.get_stats("EpRet") @@ -1170,7 +1170,7 @@ def lac( # noqa: C901 mean_ep_ret=mean_ep_ret[0], epoch=epoch, mean_ep_len=mean_ep_len[0] ) - # Log info about epoch + # Log info about epoch. logger.log_tabular("Epoch", epoch) logger.log_tabular("TotalEnvInteracts", t) logger.log_tabular( @@ -1490,7 +1490,7 @@ def lac( # noqa: C901 ), ) - # Parse logger related arguments + # Parse logger related arguments. parser.add_argument( "--exp_name", type=str, @@ -1549,7 +1549,7 @@ def lac( # noqa: C901 ) args = parser.parse_args() - # Setup actor critic arguments + # Setup actor critic arguments. output_activation = {} output_activation["actor"] = safer_eval(args.act_out_a, backend="tf") ac_kwargs = dict( @@ -1564,7 +1564,7 @@ def lac( # noqa: C901 output_activation=output_activation, ) - # Setup output dir for logger and return output kwargs + # Setup output dir for logger and return output kwargs. logger_kwargs = setup_logger_kwargs( args.exp_name, args.seed, diff --git a/stable_learning_control/control/algos/tf2/policies/actors/squashed_gaussian_actor.py b/stable_learning_control/control/algos/tf2/policies/actors/squashed_gaussian_actor.py index 2843b7ada..87bc2bcb6 100644 --- a/stable_learning_control/control/algos/tf2/policies/actors/squashed_gaussian_actor.py +++ b/stable_learning_control/control/algos/tf2/policies/actors/squashed_gaussian_actor.py @@ -113,7 +113,7 @@ def call(self, obs, deterministic=False, with_logprob=True): - pi_action (:obj:`tensorflow.Tensor`): The actions given by the policy. - logp_pi (:obj:`tensorflow.Tensor`): The log probabilities of each of these actions. """ # noqa: E501 - # Calculate mean action and standard deviation + # Calculate mean action and standard deviation. net_out = self.net(obs) mu = self.mu_layer(net_out) log_std = self.log_std_layer(net_out) @@ -127,19 +127,19 @@ def call(self, obs, deterministic=False, with_logprob=True): if deterministic: pi_action = mu # determinestic action used at test time. else: - # Sample from the normal distribution and calculate the action + # Sample from the normal distribution and calculate the action. batch_size = tf.shape(input=obs)[0] epsilon = self._normal_distribution.sample(batch_size) pi_action = affine_bijector.forward( epsilon - ) # Transform action as it was sampled from the policy distribution + ) # Transform action as it was sampled from the policy distribution. # Squash the action between (-1 and 1) pi_action = self._squash_bijector.forward(pi_action) # Compute logprob from Gaussian, and then apply correction for Tanh squashing. if with_logprob: - # Transform base_distribution to the policy distribution + # Transform base_distribution to the policy distribution. reparm_trick_bijector = tfp.bijectors.Chain( (self._squash_bijector, affine_bijector) ) @@ -150,7 +150,7 @@ def call(self, obs, deterministic=False, with_logprob=True): else: logp_pi = None - # Clamp the actions such that they are in range of the environment + # Clamp the actions such that they are in range of the environment. 
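# NOTE: Illustrative sketch, not part of this diff: a minimal NumPy version of
# mapping a tanh-squashed action from (-1, 1) onto the environment's action bounds,
# which is the kind of operation the `clamp` helper below performs (the real helper
# may differ in detail; bounds are made up for the example).
import numpy as np

def rescale_to_bounds(action, low, high):
    return low + 0.5 * (action + 1.0) * (high - low)

print(rescale_to_bounds(np.array([-1.0, 0.0, 1.0]), low=-2.0, high=2.0))  # [-2.  0.  2.]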
if self.act_limits is not None: pi_action = clamp( pi_action, diff --git a/stable_learning_control/control/algos/tf2/sac/sac.py b/stable_learning_control/control/algos/tf2/sac/sac.py index 0a5107b8f..9006bb28d 100644 --- a/stable_learning_control/control/algos/tf2/sac/sac.py +++ b/stable_learning_control/control/algos/tf2/sac/sac.py @@ -58,15 +58,15 @@ nn = import_tf(module_name="tensorflow.nn") Adam = import_tf(module_name="tensorflow.keras.optimizers", class_name="Adam") -# Import ray tuner if installed +# Import ray tuner if installed. tune = lazy_importer(module_name="ray.tune") -# Script settings +# Script settings. SCALE_LAMBDA_MIN_MAX = ( 0.0, 1.0, -) # Range of lambda lagrance multiplier -SCALE_ALPHA_MIN_MAX = (0.0, np.inf) # Range of alpha lagrance multiplier +) # Range of lambda lagrance multiplier. +SCALE_ALPHA_MIN_MAX = (0.0, np.inf) # Range of alpha lagrance multiplier. STD_OUT_LOG_VARS_DEFAULT = [ "Epoch", "TotalEnvInteracts", @@ -204,7 +204,7 @@ def __init__( # noqa: C901 } self._was_build = False - # Validate gymnasium env + # Validate gymnasium env. # NOTE: The current implementation only works with continuous spaces. if not is_gym_env(env): raise ValueError("Env must be a valid gymnasium environment.") @@ -240,7 +240,7 @@ def __init__( # noqa: C901 type="info", ) - # Store algorithm parameters + # Store algorithm parameters. self._act_dim = env.action_space.shape self._obs_dim = env.observation_space.shape self._device = set_device(device) @@ -257,7 +257,7 @@ def __init__( # noqa: C901 else: self._target_entropy = target_entropy - # Create variables for the Lagrance multipliers + # Create variables for the Lagrance multipliers. # NOTE: Clip at 1e-37 to prevent log_alpha/log_lambda from becoming -np.inf self.log_alpha = tf.Variable( tf.math.log(1e-37 if alpha < 1e-37 else alpha), name="log_alpha" @@ -277,7 +277,7 @@ def __init__( # noqa: C901 self._init_targets() - # Create optimizers + # Create optimizers. # NOTE: We here optimize for log_alpha instead of alpha because it is more # numerically stable (see: # https://github.com/rail-berkeley/softlearning/issues/136) @@ -354,7 +354,7 @@ def update(self, data): o_ ) # NOTE: Target actions coming from *current* policy - # Get target Q values based on optimization type + # Get target Q values based on optimization type. q1_pi_targ = self.ac_targ.Q1([o_, pi_]) q2_pi_targ = self.ac_targ.Q2([o_, pi_]) if self._opt_type.lower() == "minimize": @@ -365,12 +365,12 @@ def update(self, data): else: q_pi_targ = tf.math.minimum( q1_pi_targ, q2_pi_targ - ) # Use min clipping to prevent overestimation bias + ) # Use min clipping to prevent overestimation bias. q_backup = r + self._gamma * (1 - d) * (q_pi_targ - self.alpha * logp_pi_) # Compute the Q-Critic loss gradients with tf.GradientTape() as q_tape: - # Retrieve the current Q values + # Retrieve the current Q values. q1 = self.ac.Q1([o, a]) q2 = self.ac.Q2([o, a]) @@ -387,12 +387,12 @@ def update(self, data): ################################################ # Optimize Gaussian actor ###################### ################################################ - # Compute actor loss gradients + # Compute actor loss gradients. with tf.GradientTape() as a_tape: # Retrieve log probabilities of batch observations based on *current* policy pi, logp_pi = self.ac.pi(o) - # Retrieve current Q values + # Retrieve current Q values. 
# NOTE: Actions come from *current* policy q1_pi = self.ac.Q1([o, pi]) q2_pi = self.ac.Q2([o, pi]) @@ -420,9 +420,9 @@ def update(self, data): # Optimize alpha (Entropy temperature) ######### ################################################ if self._adaptive_temperature: - # Compute alpha loss gradients + # Compute alpha loss gradients. with tf.GradientTape() as alpha_tape: - # Calculate alpha loss + # Calculate alpha loss. alpha_loss = -tf.reduce_mean( self.alpha * tf.stop_gradient(logp_pi + self.target_entropy) ) # See Haarnoja eq. 17 @@ -468,7 +468,7 @@ def save(self, path, checkpoint_name="checkpoint"): except Exception as e: raise Exception("SAC model could not be saved.") from e - # Save additional information + # Save additional information. save_info = { "alg_name": self.__class__.__name__, "setup_kwargs": self._setup_kwargs, @@ -500,7 +500,7 @@ def restore(self, path, restore_lagrance_multipliers=False): "path and try again." ) - # Store initial values in order to ignore them when loading the weights + # Store initial values in order to ignore them when loading the weights. lr_a = self._lr_a.value() lr_alpha = self._lr_alpha.value() lr_c = self._lr_c.value() @@ -544,7 +544,7 @@ def export(self, path): obs_dummy = tf.random.uniform( combine_shapes(1, self._obs_dim), dtype=tf.float32 ) - self.ac.pi.get_action(obs_dummy) # Make sure the full graph was traced + self.ac.pi.get_action(obs_dummy) # Make sure the full graph was traced. self.ac.pi.save(osp.join(path, "tf2_save")) def build(self): @@ -858,7 +858,7 @@ def sac( # noqa: C901 env = env_fn() - # Validate gymnasium env + # Validate gymnasium env. # NOTE: The current implementation only works with continuous spaces. if not is_gym_env(env): raise ValueError("Env must be a valid gymnasium environment.") @@ -905,9 +905,9 @@ def sac( # noqa: C901 hyper_paramet_dict = { k: v for k, v in locals().items() if k not in ["logger"] } # Retrieve hyperparameters (Ignore logger object) - logger.save_config(hyper_paramet_dict) # Write hyperparameters to logger + logger.save_config(hyper_paramet_dict) # Write hyperparameters to logger. - # Retrieve max episode length + # Retrieve max episode length. if max_ep_len is None: max_ep_len = env.env._max_episode_steps else: @@ -928,7 +928,7 @@ def sac( # noqa: C901 # Get default actor critic if no 'actor_critic' was supplied actor_critic = SoftActorCritic if actor_critic is None else actor_critic - # Set random seed for reproducible results + # Set random seed for reproducible results. if seed is not None: os.environ["PYTHONHASHSEED"] = str(seed) os.environ["TF_CUDNN_DETERMINISTIC"] = "1" # new flag present in tf 2.0+ @@ -951,13 +951,13 @@ def sac( # noqa: C901 device, ) - # Create learning rate schedulers + # Create learning rate schedulers. # NOTE: Alpha currently uses the same scheduler as the actor. lr_decay_ref_var = total_steps if lr_decay_ref.lower() == "steps" else epochs lr_a_scheduler = get_lr_scheduler(lr_decay_type, lr_a, lr_a_final, lr_decay_ref_var) lr_c_scheduler = get_lr_scheduler(lr_decay_type, lr_c, lr_c_final, lr_decay_ref_var) - # Restore policy if supplied + # Restore policy if supplied. if start_policy is not None: logger.log(f"Restoring model from '{start_policy}'.", type="info") try: @@ -979,7 +979,7 @@ def sac( # noqa: C901 size=replay_size, ) - # Count variables and print network structure + # Count variables and print network structure. 
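# NOTE: Illustrative sketch, not part of this diff: a hedged sketch of what a
# count_vars-style helper boils down to for a Keras model (the example model below
# is made up and is not the package's actor-critic; the real helper may differ).
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(4,))])
print(model.count_params())  # 4 * 2 weights + 2 biases = 10.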
var_counts = tuple( count_vars(module) for module in [policy.ac.pi, policy.ac.Q1, policy.ac.Q2] ) @@ -992,7 +992,7 @@ def sac( # noqa: C901 logger.setup_tf_saver(policy) - # Setup diagnostics tb_write dict and store initial learning rates + # Setup diagnostics tb_write dict and store initial learning rates. diag_tb_log_list = ["LossQ", "LossPi", "Alpha", "LossAlpha", "Entropy"] if use_tensorboard: logger.log_to_tb( @@ -1027,7 +1027,7 @@ def sac( # noqa: C901 else: a = env.action_space.sample() - # Take step in the env + # Take step in the env. o_, r, d, truncated, _ = env.step(a) ep_ret += r ep_len += 1 @@ -1037,22 +1037,22 @@ def sac( # noqa: C901 # Make sure to update most recent observation! o = o_ - # End of trajectory handling + # End of trajectory handling. if d or truncated: logger.store(EpRet=ep_ret, EpLen=ep_len) o, _ = env.reset() ep_ret, ep_len = 0, 0 - # Update handling + # Update handling. if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0: - # Step based learning rate decay + # Step based learning rate decay. if lr_decay_ref.lower() == "step": lr_a_now = max( lr_a_scheduler(t + 1), lr_a_final - ) # Make sure lr is bounded above final lr + ) # Make sure lr is bounded above final lr. lr_c_now = max( lr_c_scheduler(t + 1), lr_c_final - ) # Make sure lr is bounded above final lr + ) # Make sure lr is bounded above final lr. policy.set_learning_rates( lr_a=lr_a_now, lr_c=lr_c_now, lr_alpha=lr_a_now ) @@ -1060,9 +1060,9 @@ def sac( # noqa: C901 for _ in range(steps_per_update): batch = replay_buffer.sample_batch(batch_size) update_diagnostics = policy.update(data=batch) - logger.store(**update_diagnostics) # Log diagnostics + logger.store(**update_diagnostics) # Log diagnostics. - # SGD batch tb logging + # SGD batch tb logging. if use_tensorboard and not tb_low_log_freq: logger.log_to_tb(keys=diag_tb_log_list, global_step=t) @@ -1070,11 +1070,11 @@ def sac( # noqa: C901 if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch - # Save model + # Save model. if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({"env": env}, itr=epoch) - # Test the performance of the deterministic version of the agent + # Test the performance of the deterministic version of the agent. if num_test_episodes != 0: eps_ret, eps_len = test_agent( policy, test_env, num_test_episodes, max_ep_len=max_ep_len @@ -1085,19 +1085,19 @@ def sac( # noqa: C901 extend=True, ) - # Epoch based learning rate decay + # Epoch based learning rate decay. if lr_decay_ref.lower() != "step": lr_a_now = max( lr_a_scheduler(epoch), lr_a_final - ) # Make sure lr is bounded above final + ) # Make sure lr is bounded above final. lr_c_now = max( lr_c_scheduler(epoch), lr_c_final - ) # Make sure lr is bounded above final + ) # Make sure lr is bounded above final. policy.set_learning_rates( lr_a=lr_a_now, lr_c=lr_c_now, lr_alpha=lr_a_now ) - # Log performance measure to ray tuning + # Log performance measure to ray tuning. # NOTE: Only executed when the ray tuner invokes the script if hasattr(tune, "session") and tune.session._session is not None: mean_ep_ret = logger.get_stats("EpRet") @@ -1106,7 +1106,7 @@ def sac( # noqa: C901 mean_ep_ret=mean_ep_ret[0], epoch=epoch, mean_ep_len=mean_ep_len[0] ) - # Log info about epoch + # Log info about epoch. logger.log_tabular("Epoch", epoch) logger.log_tabular("TotalEnvInteracts", t) logger.log_tabular( @@ -1405,7 +1405,7 @@ def sac( # noqa: C901 ), ) - # Parse logger related arguments + # Parse logger related arguments. 
parser.add_argument( "--exp_name", type=str, @@ -1464,7 +1464,7 @@ def sac( # noqa: C901 ) args = parser.parse_args() - # Setup actor critic arguments + # Setup actor critic arguments. output_activation = {} output_activation["actor"] = safer_eval(args.act_out_a, backend="tf") output_activation["critic"] = safer_eval(args.act_out_c, backend="tf") @@ -1480,7 +1480,7 @@ def sac( # noqa: C901 output_activation=output_activation, ) - # Setup output dir for logger and return output kwargs + # Setup output dir for logger and return output kwargs. logger_kwargs = setup_logger_kwargs( args.exp_name, args.seed, diff --git a/stable_learning_control/control/common/buffers.py b/stable_learning_control/control/common/buffers.py index d8da36452..0b8fa8e8b 100644 --- a/stable_learning_control/control/common/buffers.py +++ b/stable_learning_control/control/common/buffers.py @@ -172,7 +172,7 @@ def __init__( type="warning", ) - # Main buffers + # Main buffers. self.obs_buf = atleast_2d( np.zeros(combine_shapes(size, obs_dim), dtype=np.float32).squeeze() ) @@ -187,13 +187,13 @@ def __init__( ).squeeze() self.done_buf = np.zeros(int(size), dtype=np.float32) - # Optional buffers + # Optional buffers. self.adv_buf = np.zeros(size, dtype=np.float32) self.ret_buf = np.zeros(size, dtype=np.float32) self.val_buf = np.zeros(size, dtype=np.float32) self.logp_buf = np.zeros(size, dtype=np.float32) - # Store buffer attributes + # Store buffer attributes. self.ptr, self.traj_ptr, self.n_traj, self._max_size = 0, 0, 0, size self.traj_ptrs = [] self.traj_lengths = [] @@ -216,9 +216,9 @@ def store(self, obs, act, rew, next_obs, done, val=None, logp=None): # noqa: C9 logp (numpy.ndarray, optional): The log probabilities of the actions. Defaults to ``None``. """ - assert self.ptr < self._max_size # buffer has to have room so you can store + assert self.ptr < self._max_size # buffer has to have room so you can store. - # Fill primary buffer + # Fill primary buffer. try: self.obs_buf[self.ptr] = obs self.obs_next_buf[self.ptr] = next_obs @@ -254,7 +254,7 @@ def store(self, obs, act, rew, next_obs, done, val=None, logp=None): # noqa: C9 ) raise ValueError(error_msg) - # Fill optional buffer + # Fill optional buffer. if val: try: self.val_buf[self.ptr] = val @@ -276,7 +276,7 @@ def store(self, obs, act, rew, next_obs, done, val=None, logp=None): # noqa: C9 raise ValueError(error_msg) self._contains_logp = True - # Increase buffer pointers + # Increase buffer pointers. self.ptr += 1 def finish_path(self, last_val=0): @@ -300,7 +300,7 @@ def finish_path(self, last_val=0): # Calculate the advantage and rewards-to-go if buffer contains vals if self._contains_vals: - # Get the current trajectory + # Get the current trajectory. path_slice = slice(self.traj_ptr, self.ptr) rews = np.append(self.rew_buf[path_slice], last_val) vals = np.append(self.val_buf[path_slice], last_val) @@ -312,7 +312,7 @@ def finish_path(self, last_val=0): # the next line computes rewards-to-go, to be targets for the value function self.ret_buf[path_slice] = discount_cumsum(rews, self._gamma)[:-1] - # Store trajectory length and update trajectory pointers + # Store trajectory length and update trajectory pointers. self.traj_lengths.append(self.ptr - self.traj_ptr) self.traj_ptrs.append(self.traj_ptr) self.traj_ptr = self.ptr @@ -337,10 +337,10 @@ def get(self, flat=False): # noqa: C901 Returns: dict: The trajectory buffer. """ - if not self._preempt: # Check if buffer was full + if not self._preempt: # Check if buffer was full. 
assert self.ptr == self._max_size - # Remove incomplete trajectories + # Remove incomplete trajectories. if not self._incomplete and self.traj_ptr != self.ptr: if not self._incomplete_warn: log_to_std_out( @@ -353,7 +353,7 @@ def get(self, flat=False): # noqa: C901 else: buffer_end_ptr = self.ptr - # Remove trajectories that are to short + # Remove trajectories that are to short. if self.traj_lengths[-1] < self._min_traj_size: if not self._min_traj_size_warn: log_to_std_out( @@ -367,7 +367,7 @@ def get(self, flat=False): # noqa: C901 buffer_end_ptr = self.traj_ptr - self.traj_lengths[-1] self.traj_lengths = self.traj_lengths[:-1] - # Create trajectory buffer dictionary + # Create trajectory buffer dictionary. buff_slice = slice(0, buffer_end_ptr) if flat: data = dict( @@ -396,9 +396,9 @@ def get(self, flat=False): # noqa: C901 if self._contains_logp: data["lopg"] = np.split(self.logp_buf[buff_slice], self.traj_ptrs[1:]) - # Reset buffer and traj indexes + # Reset buffer and traj indexes. self.ptr, self.traj_ptr, self.traj_ptrs, self.n_traj = 0, 0, [], 0 self.traj_lengths = [] - # Return experience tuple + # Return experience tuple. return data diff --git a/stable_learning_control/control/common/helpers.py b/stable_learning_control/control/common/helpers.py index d857bdfdd..639e18e8d 100644 --- a/stable_learning_control/control/common/helpers.py +++ b/stable_learning_control/control/common/helpers.py @@ -74,7 +74,7 @@ def get_activation_function(activation_fn_name, backend="torch"): else: backend_prefix = ["torch", "nn"] - # Retrieve activation function + # Retrieve activation function. if len(activation_fn_name.split(".")) == 1: activation_fn_name = ".".join(backend_prefix) + "." + activation_fn_name elif len(activation_fn_name.split(".")) == 2: diff --git a/stable_learning_control/control/utils/eval_robustness.py b/stable_learning_control/control/utils/eval_robustness.py index 4c93d497a..7fe06a703 100644 --- a/stable_learning_control/control/utils/eval_robustness.py +++ b/stable_learning_control/control/utils/eval_robustness.py @@ -170,7 +170,7 @@ def run_disturbed_policy( # noqa: C901 verbose_fmt="table", output_dir=output_dir, output_fname="eval_statistics.csv" ) - # Increase action space + # Increase action space. # NOTE: Needed to prevent the disturbance from being clipped by the action space. env.unwrapped.action_space.high = np.array( [np.finfo(np.float32).max for item in env.unwrapped.action_space.high] @@ -179,7 +179,7 @@ def run_disturbed_policy( # noqa: C901 [np.finfo(np.float32).min for item in env.unwrapped.action_space.low] ) - # Increase max episode length if requested + # Increase max episode length if requested. if max_ep_len is None: max_ep_len = env._max_episode_steps else: @@ -195,7 +195,7 @@ def run_disturbed_policy( # noqa: C901 ) env._max_episode_steps = max_ep_len - # Try to retrieve default type and variant if not supplied + # Try to retrieve default type and variant if not supplied. if disturbance_type is None: if hasattr(env.unwrapped, "_disturber_cfg"): if "default_type" in env.unwrapped._disturber_cfg.keys(): @@ -247,14 +247,14 @@ def run_disturbed_policy( # noqa: C901 env.disturbance_info["type"] if hasattr(env, "disturbance_info") and "type" in env.disturbance_info.keys() else disturbance_type - ) # Retrieve used disturbance type + ) # Retrieve used disturbance type. 
disturbance_variant = ( env.disturbance_info["variant"] if hasattr(env, "disturbance_info") and "variant" in env.disturbance_info.keys() else disturbance_variant - ) # Retrieve used disturbance variant + ) # Retrieve used disturbance variant. - # Loop though all disturbances till disturber is done + # Loop though all disturbances till disturber is done. logger.log("Starting robustness evaluation...", type="info") render_error = False path = { @@ -272,11 +272,11 @@ def run_disturbed_policy( # noqa: C901 ) = ([], [], [], []) n_disturbance = 0 soi_found, ref_found = True, True - supports_deterministic = True # Only supported with gaussian algorithms + supports_deterministic = True # Only supported with gaussian algorithms. while not env.disturber_done: o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 while n < num_episodes: - # Render env if requested + # Render env if requested. if render and not render_error: try: env.render() @@ -292,7 +292,7 @@ def run_disturbed_policy( # noqa: C901 type="warning", ) - # Retrieve action + # Retrieve action. if deterministic and supports_deterministic: try: a = policy.get_action(o, deterministic=deterministic) @@ -343,7 +343,7 @@ def run_disturbed_policy( # noqa: C901 type="warning", ) - # Store performance measurements + # Store performance measurements. if d or (ep_len == max_ep_len): died = ep_len < max_ep_len logger.store(EpRet=ep_ret, EpLen=ep_len, DeathRate=(float(died))) @@ -352,24 +352,24 @@ def run_disturbed_policy( # noqa: C901 % (n, ep_ret, ep_len, died) ) - # Store observations + # Store observations. o_episode_df = pd.DataFrame(path["o"]) o_episode_df.insert(0, "step", range(0, ep_len)) o_episode_df = pd.melt( o_episode_df, id_vars="step", var_name="observation", - ) # Flatten dataframe + ) # Flatten dataframe. o_episodes_dfs.append(o_episode_df) - # Store episode rewards + # Store episode rewards. r_episode_df = pd.DataFrame( {"step": range(0, ep_len), "reward": path["r"]} ) r_episode_df.insert(len(r_episode_df.columns), "episode", n) r_episodes_dfs.append(r_episode_df) - # Store states of interest + # Store states of interest. if soi_found: soi_episode_df = pd.DataFrame(path["state_of_interest"]) soi_episode_df.insert(0, "step", range(0, ep_len)) @@ -378,10 +378,10 @@ def run_disturbed_policy( # noqa: C901 id_vars="step", var_name="state_of_interest", value_name="error", - ) # Flatten dataframe + ) # Flatten dataframe. soi_episodes_dfs.append(soi_episode_df) - # Store reference + # Store reference. if ref_found: ref_episode_df = pd.DataFrame(path["reference"]) ref_episode_df.insert(0, "step", range(0, ep_len)) @@ -389,10 +389,10 @@ def run_disturbed_policy( # noqa: C901 ref_episode_df, id_vars="step", var_name="reference", - ) # Flatten dataframe + ) # Flatten dataframe. ref_episodes_dfs.append(ref_episode_df) - # Increment counters and reset storage variables + # Increment counters and reset storage variables. n += 1 o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 path = { @@ -402,7 +402,7 @@ def run_disturbed_policy( # noqa: C901 "state_of_interest": [], } - # Print robustness evaluation diagnostics + # Print robustness evaluation diagnostics. if hasattr(env, "disturbance_info") and "type" in env.disturbance_info.keys(): logger.log_tabular( "DisturbanceType", @@ -441,7 +441,7 @@ def run_disturbed_policy( # noqa: C901 log_to_std_out("") logger.dump_tabular() - # Add extra disturbance information to the robustness eval dataframe + # Add extra disturbance information to the robustness eval dataframe. 
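# NOTE: Illustrative sketch, not part of this diff: the pandas pattern used above
# and below in its smallest form, i.e. flatten per-episode observations with
# pd.melt and tag the result with a disturbance label (all values made up).
import pandas as pd

o_episode_df = pd.DataFrame([[0.1, 0.2], [0.3, 0.4]])  # One row per step, one column per observation.
o_episode_df.insert(0, "step", range(0, 2))
flat = pd.melt(o_episode_df, id_vars="step", var_name="observation")  # Long format: step/observation/value.
flat.insert(len(flat.columns), "disturbance", "impulse: 100 N")       # Made-up label for the robustness plots.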
disturbance_label = ( env.disturbance_info["label"] if ( @@ -483,7 +483,7 @@ def run_disturbed_policy( # noqa: C901 ) ref_disturbances_dfs.append(ref_disturbance_df) - # Reset storage buckets and go to next disturbance + # Reset storage buckets and go to next disturbance. o_episodes_dfs = [] r_episodes_dfs = [] soi_episodes_dfs = [] @@ -491,7 +491,7 @@ def run_disturbed_policy( # noqa: C901 env.next_disturbance() n_disturbance += 1 - # Merge robustness evaluation information for all disturbances + # Merge robustness evaluation information for all disturbances. o_disturbances_df = pd.concat(o_disturbances_dfs, ignore_index=True) r_disturbances_df = pd.concat(r_disturbances_dfs, ignore_index=True) soi_disturbances_df = pd.concat(soi_disturbances_dfs, ignore_index=True) @@ -526,7 +526,7 @@ def run_disturbed_policy( # noqa: C901 disturbance_variant, ) - # Save robustness evaluation dataframe and return it to the user + # Save robustness evaluation dataframe and return it to the user. if save_result: results_path = logger.output_dir.joinpath("results.csv") logger.log( @@ -583,7 +583,7 @@ def plot_robustness_results( # noqa: C901 else None ) - # Unpack required data from dataframe + # Unpack required data from dataframe. obs_found, rew_found, soi_found, ref_found = True, True, True, True o_disturbances_df, ref_disturbances_df = pd.DataFrame(), pd.DataFrame() if "observation" in dataframe["variable"].unique(): @@ -611,13 +611,13 @@ def plot_robustness_results( # noqa: C901 else: ref_found = False - # Merge observations and references + # Merge observations and references. if obs_found: obs_df_tmp = o_disturbances_df.copy(deep=True) obs_df_tmp["signal"] = "obs_" + (obs_df_tmp["observation"] + 1).astype(str) obs_df_tmp.insert(len(obs_df_tmp.columns), "type", "observation") - # Retrieve the requested observations + # Retrieve the requested observations. observations = validate_observations(observations, o_disturbances_df) observations = [obs - 1 for obs in observations] # Humans count from 1 obs_df_tmp = obs_df_tmp.query(f"observation in {observations}") @@ -627,7 +627,7 @@ def plot_robustness_results( # noqa: C901 ref_df_tmp.insert(len(ref_df_tmp.columns), "type", "reference") obs_ref_df = pd.concat([obs_df_tmp, ref_df_tmp], ignore_index=True) - # Loop though all disturbances and plot the observations and references in one plot + # Loop though all disturbances and plot the observations and references in one plot. fig_title = "{} under several {}{}.".format( "Observation and reference" if all([obs_found, ref_found]) @@ -640,7 +640,7 @@ def plot_robustness_results( # noqa: C901 obs_ref_df.loc[obs_ref_df["disturbance_index"] == 0, "disturbance"] = ( obs_ref_df.loc[obs_ref_df["disturbance_index"] == 0, "disturbance"] + " (original)" - ) # Append original to original value + ) # Append original to original value. if not merged: num_plots = len(obs_ref_df.disturbance.unique()) total_cols = 3 @@ -676,7 +676,7 @@ def plot_robustness_results( # noqa: C901 ).set_title(fig_title) figs["observations"].append(fig) - # Plot mean cost + # Plot mean cost. if rew_found: fig = plt.figure(tight_layout=True) figs["costs"].append(fig) @@ -687,7 +687,7 @@ def plot_robustness_results( # noqa: C901 r_disturbances_df["disturbance_index"] == 0, "disturbance" ] + " (original)" - ) # Append original to original value + ) # Append original to original value. 
sns.lineplot( data=r_disturbances_df, x="step", y="reward", ci="sd", hue="disturbance" ).set_title( @@ -707,7 +707,7 @@ def plot_robustness_results( # noqa: C901 type="warning", ) - # Plot states of interest + # Plot states of interest. if soi_found: n_soi = soi_disturbances_df["state_of_interest"].max() + 1 soi_disturbances_df.loc[ @@ -717,7 +717,7 @@ def plot_robustness_results( # noqa: C901 soi_disturbances_df["disturbance_index"] == 0, "disturbance" ] + " (original)" - ) # Append original to original value + ) # Append original to original value. for index in range(0, n_soi): fig = plt.figure(tight_layout=True) figs["states_of_interest"].append(fig) @@ -746,7 +746,7 @@ def plot_robustness_results( # noqa: C901 type="warning", ) - # Save plots + # Save plots. if save_figs: figs_path = output_dir.joinpath("figures") figs_extension = figs_fmt[1:] if figs_fmt.startswith(".") else figs_fmt @@ -901,10 +901,10 @@ def plot_robustness_results( # noqa: C901 ) args = parser.parse_args() - # Load policy and environment + # Load policy and environment. env, policy = load_policy_and_env(args.fpath, args.itr if args.itr >= 0 else "last") - # List d_type or d_variant if requested + # List d_type or d_variant if requested. if args.list_disturbance_types or args.list_disturbance_variants: if hasattr(env.unwrapped, "_disturber_cfg"): if args.list_disturbance_types: @@ -981,11 +981,11 @@ def plot_robustness_results( # noqa: C901 log_to_std_out(friendly_err(error_msg)) sys.exit() - # Retrieve output_dir + # Retrieve output_dir. if not args.data_dir: args.data_dir = args.fpath - # Perform robustness evaluation + # Perform robustness evaluation. run_results_df = run_disturbed_policy( env, policy, diff --git a/stable_learning_control/control/utils/eval_utils.py b/stable_learning_control/control/utils/eval_utils.py index 2d4304740..4e685d83b 100644 --- a/stable_learning_control/control/utils/eval_utils.py +++ b/stable_learning_control/control/utils/eval_utils.py @@ -23,7 +23,7 @@ def test_agent(policy, env, num_episodes, max_ep_len=None): o, _ = env.reset() d, truncated, ep_ret, ep_len = False, False, 0, 0 while not (d or truncated): - # Take deterministic actions at test time + # Take deterministic actions at test time. o, r, d, truncated, _ = env.step(policy.get_action(o, True)) ep_ret += r ep_len += 1 diff --git a/stable_learning_control/control/utils/gym_utils.py b/stable_learning_control/control/utils/gym_utils.py index f13d40b4a..3062417e5 100644 --- a/stable_learning_control/control/utils/gym_utils.py +++ b/stable_learning_control/control/utils/gym_utils.py @@ -67,7 +67,7 @@ def validate_gym_env(arg_dict): AssertError: Raised when a environment is supplied that is not a valid gymnasium environment. """ - # Import gymnasium environments + # Import gymnasium environments. # import gymnasium as gym # Import environment configuration file. This file can be used to inject diff --git a/stable_learning_control/control/utils/plot.py b/stable_learning_control/control/utils/plot.py index 89252c5fb..03890c028 100644 --- a/stable_learning_control/control/utils/plot.py +++ b/stable_learning_control/control/utils/plot.py @@ -195,18 +195,18 @@ def get_all_datasets(all_logdirs, legend=None, select=None, exclude=None): if exclude is not None: logdirs = [log for log in logdirs if all(not (x in log) for x in exclude)] - # Verify logdirs + # Verify logdirs. 
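# NOTE: Illustrative sketch, not part of this diff: the exclude-substring filtering
# applied to the log directories just above, with made-up directory names.
logdirs = ["data/lac_oscillator/seed0", "data/sac_cartpole/seed0", "data/lac_cartpole/seed1"]
exclude = ["cartpole"]
logdirs = [log for log in logdirs if all(x not in log for x in exclude)]
print(logdirs)  # ['data/lac_oscillator/seed0']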
print("Plotting from...\n" + "=" * DIV_LINE_WIDTH + "\n") for logdir in logdirs: print(logdir) print("\n" + "=" * DIV_LINE_WIDTH) - # Make sure the legend is compatible with the logdirs + # Make sure the legend is compatible with the logdirs. assert not (legend) or (len(legend) == len(logdirs)), friendly_err( "Must give a legend title for each set of experiments." ) - # Load data from logdirs + # Load data from logdirs. data = [] if legend: for log, leg in zip(logdirs, legend): diff --git a/stable_learning_control/control/utils/run_utils.py b/stable_learning_control/control/utils/run_utils.py index acd96554f..ec34d0582 100644 --- a/stable_learning_control/control/utils/run_utils.py +++ b/stable_learning_control/control/utils/run_utils.py @@ -71,10 +71,10 @@ def call_experiment( datestamp (bool): Whether a datestamp should be added to the experiment name. kwargs: All kwargs to pass to thunk. """ - # Determine number of CPU cores to run on + # Determine number of CPU cores to run on. num_cpu = psutil.cpu_count(logical=False) if num_cpu == "auto" else num_cpu - # Send random seed to thunk + # Send random seed to thunk. kwargs["seed"] = seed # Be friendly and print out your kwargs, so we all know what's up @@ -85,7 +85,7 @@ def call_experiment( print(json.dumps(kwargs_json, separators=(",", ":\t"), indent=4, sort_keys=True)) print("\n") - # Set up logger output directory + # Set up logger output directory. if "logger_kwargs" not in kwargs: kwargs["logger_kwargs"] = setup_logger_kwargs( exp_name, seed, data_dir, datestamp @@ -101,7 +101,7 @@ def call_experiment( **kwargs["logger_kwargs"], ) - # Force algorithm default if verbose_fmt is line + # Force algorithm default if verbose_fmt is line. # NOTE: Done since otherwise the stdout gets cluttered. if kwargs["logger_kwargs"]["verbose_fmt"] == "line": kwargs["logger_kwargs"]["verbose_vars"] = None @@ -110,7 +110,7 @@ def thunk_plus(): """Setup environment used in the experiment.""" # Make 'env_fn' from 'env_name' if "env_name" in kwargs: - # Import gymnasium environments + # Import gymnasium environments. import gymnasium as gym # Import environment configuration file. This file can be used to inject @@ -129,13 +129,13 @@ def thunk_plus(): env_kwargs = kwargs.pop("env_kwargs", {}) kwargs["env_fn"] = lambda: gym.make(env_name, **env_kwargs) - # Fork into multiple processes + # Fork into multiple processes. mpi_fork(num_cpu) - # Run thunk + # Run thunk. thunk(**kwargs) - # Prepare to launch a script to run the experiment + # Prepare to launch a script to run the experiment. pickled_thunk = cloudpickle.dumps(thunk_plus) encoded_thunk = base64.b64encode(zlib.compress(pickled_thunk)).decode("utf-8") @@ -374,10 +374,10 @@ def get_val(v, k): param_name = sh if sh is not None else k param_name = valid_str(param_name) - # Get variant value for parameter k + # Get variant value for parameter k. variant_val = get_val(variant, k) - # Append to name + # Append to name. if all_bools(v): # If this is a param which only takes boolean values, # only include in the name if it's True for this variant. diff --git a/stable_learning_control/control/utils/safer_eval.py b/stable_learning_control/control/utils/safer_eval.py index e0a1bad31..15cae9707 100644 --- a/stable_learning_control/control/utils/safer_eval.py +++ b/stable_learning_control/control/utils/safer_eval.py @@ -11,7 +11,7 @@ .. 
autofunction:: safer_eval """ # NOTE: Manual autofunction request was added because of bug https://github.com/sphinx-doc/sphinx/issues/7912#issuecomment-786011464 # noqa:E501 -# Import modules to which you want users to have access +# Import modules to which you want users to have access. import torch # noqa: F401 import stable_learning_control as stable_learning_control # noqa: F401 @@ -34,7 +34,7 @@ def safer_eval(*args, backend=None): args: The eval return values. """ - # Import the nn module based on the backend type + # Import the nn module based on the backend type. # NOTE: This was done to enable users to specify `nn.relu` instead of # `torch.nn.ReLu`. if backend is not None and backend.lower() in ["torch", "pytorch"]: diff --git a/stable_learning_control/control/utils/test_policy.py b/stable_learning_control/control/utils/test_policy.py index c30a80bfb..7dd4c7b06 100644 --- a/stable_learning_control/control/utils/test_policy.py +++ b/stable_learning_control/control/utils/test_policy.py @@ -143,7 +143,7 @@ def load_policy_and_env(fpath, itr="last"): ) ) - # Retrieve model path and backend + # Retrieve model path and backend. fpath, backend = _retrieve_model_folder(fpath) if itr != "last": @@ -152,7 +152,7 @@ def load_policy_and_env(fpath, itr="last"): ) itr = "%d" % itr - # try to load environment from save + # try to load environment from save. # NOTE: Sometimes this will fail because the environment could not be pickled. try: state = joblib.load(Path(fpath).parent.joinpath("vars.pkl")) @@ -169,7 +169,7 @@ def load_policy_and_env(fpath, itr="last"): ) ) from e - # load the get_action function + # load the get_action function. try: if backend == "tf": policy = load_tf_policy(fpath, env=env, itr=itr) @@ -205,11 +205,11 @@ def load_tf_policy(fpath, env, itr="last"): model_path = _retrieve_iter_folder(fpath, itr) else: model_path = fpath - tf = import_tf() # Import tf if installed otherwise throw warning + tf = import_tf() # Import tf if installed otherwise throw warning. print("\n") log_to_std_out("Loading model from '%s'.\n\n" % fpath, type="info") - # Retrieve get_action method + # Retrieve get_action method. save_info = load_from_json(Path(fpath).joinpath("save_info.json")) import stable_learning_control.control.algos.tf2 as tf2_algos @@ -218,7 +218,7 @@ def load_tf_policy(fpath, env, itr="last"): except KeyError: ac_kwargs = {} model = getattr(tf2_algos, save_info["alg_name"])(env=env, **ac_kwargs) - latest = tf.train.latest_checkpoint(model_path) # Restore latest checkpoint + latest = tf.train.latest_checkpoint(model_path) # Restore latest checkpoint. model.load_weights(latest) return model @@ -246,7 +246,7 @@ def load_pytorch_policy(fpath, env, itr="last"): print("\n") log_to_std_out("Loading model from '%s'.\n\n" % model_file, type="info") - # Retrieve get_action method + # Retrieve get_action method. save_info = load_from_json(Path(fpath).joinpath("save_info.json")) import stable_learning_control.control.algos.pytorch as torch_algos @@ -256,7 +256,7 @@ def load_pytorch_policy(fpath, env, itr="last"): except KeyError: ac_kwargs = {} model = getattr(torch_algos, save_info["alg_name"])(env=env, **ac_kwargs) - model.load_state_dict(model_data) # Retore model parameters + model.load_state_dict(model_data) # Retore model parameters. 
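# NOTE: Illustrative sketch, not part of this diff: the state_dict round trip the
# loader above relies on, in its smallest form (model and path are made up).
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
torch.save(model.state_dict(), "/tmp/model_state.pt")        # Save the parameters only.
restored = nn.Linear(4, 2)                                   # Same architecture is required.
restored.load_state_dict(torch.load("/tmp/model_state.pt"))  # Restore the parameters.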
return model @@ -290,10 +290,10 @@ def run_policy( logger = EpochLogger(verbose_fmt="table") o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 - supports_deterministic = True # Only supported with gaussian algorithms + supports_deterministic = True # Only supported with gaussian algorithms. render_error = False while n < num_episodes: - # Render env if requested + # Render env if requested. if render and not render_error: try: env.render() @@ -308,7 +308,7 @@ def run_policy( type="warning", ) - # Retrieve action + # Retrieve action. if deterministic and supports_deterministic: try: a = policy.get_action(o, deterministic=deterministic) @@ -324,7 +324,7 @@ def run_policy( else: a = policy.get_action(o) - # Perform action in the environment and store result + # Perform action in the environment and store result. o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 diff --git a/stable_learning_control/env_config.py b/stable_learning_control/env_config.py index c589ca44c..9b1101b97 100644 --- a/stable_learning_control/env_config.py +++ b/stable_learning_control/env_config.py @@ -11,7 +11,7 @@ .. code-block:: python - # Import environments you want to use + # Import environments you want to use. import custom_environment_1 import custom_environment_2 """ # noqa: E501 diff --git a/stable_learning_control/run.py b/stable_learning_control/run.py index 612b4c066..7e14b0f90 100644 --- a/stable_learning_control/run.py +++ b/stable_learning_control/run.py @@ -90,9 +90,9 @@ def _parse_exp_cfg(cmd_line_args): # noqa: C901 if "--exp_cfg" in cmd_line_args: cfg_error = False exp_cfg_idx = cmd_line_args.index("--exp_cfg") - cmd_line_args.pop(exp_cfg_idx) # Remove exp_cfg argument + cmd_line_args.pop(exp_cfg_idx) # Remove exp_cfg argument. - # Validate config path + # Validate config path. try: exp_cfg_file_path = cmd_line_args.pop(exp_cfg_idx) exp_cfg_file_path = ( @@ -128,9 +128,9 @@ def _parse_exp_cfg(cmd_line_args): # noqa: C901 type="warning", ) - # Read configuration values + # Read configuration values. if not cfg_error: - # Load exp config + # Load exp config. with open(exp_cfg_file_path) as stream: try: exp_cfg_params = yaml.safe_load(stream) @@ -142,7 +142,7 @@ def _parse_exp_cfg(cmd_line_args): # noqa: C901 type="warning", ) - # Retrieve values from exp config + # Retrieve values from exp config. log_to_std_out( f"Experiment hyperparameters loaded from '{exp_cfg_file_path}'", type="info", @@ -154,14 +154,14 @@ def _parse_exp_cfg(cmd_line_args): # noqa: C901 type="warning", ) else: - # Retrieve algorithm if not supplied by user + # Retrieve algorithm if not supplied by user. if exp_cfg_idx == 1: if "alg_name" in exp_cfg_params.keys(): cmd_line_args.insert(1, exp_cfg_params.pop("alg_name", None)) else: exp_cfg_params.pop("alg_name") - # Append cfg hyperparameters to input arguments + # Append cfg hyperparameters to input arguments. # NOTE: Here we assume comma or space separated strings to be variants. exp_cfg_params = { (key if key.startswith("--") else "--" + key): val @@ -257,7 +257,7 @@ def _parse_and_execute_grid_search(cmd, args): # noqa: C901 """ cmd, backend = _add_backend_to_cmd(cmd) - # warning + # warning. algo = safer_eval("stable_learning_control.control." + cmd, backend=backend) # Before all else, check to see if any of the flags is 'help'. @@ -309,12 +309,12 @@ def _parse_and_execute_grid_search(cmd, args): # noqa: C901 # These special shortcuts are described by SUBSTITUTIONS. 
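# NOTE: Illustrative sketch, not part of this diff: a small standalone version of the
# shorthand-to-true-name swap performed below; the mapping and arguments here are
# made up for the example and are not the package's real SUBSTITUTIONS table.
SUBSTITUTIONS = {"env": "env_name", "hid": "ac_kwargs:hidden_sizes"}
arg_dict = {"env": ["CartPole-v1"], "epochs": [50]}

for special_name, true_name in SUBSTITUTIONS.items():
    if special_name in arg_dict:
        arg_dict[true_name] = arg_dict.pop(special_name)  # Swap it in the arg dict.

print(arg_dict)  # {'epochs': [50], 'env_name': ['CartPole-v1']}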
for special_name, true_name in SUBSTITUTIONS.items(): if special_name in arg_dict: - # swap it in arg dict + # swap it in arg dict. arg_dict[true_name] = arg_dict[special_name] del arg_dict[special_name] if special_name in given_shorthands: - # point the shortcut to the right name + # point the shortcut to the right name. given_shorthands[true_name] = given_shorthands[special_name] del given_shorthands[special_name] diff --git a/stable_learning_control/user_config.py b/stable_learning_control/user_config.py index b6344b03a..6ec57d5bd 100644 --- a/stable_learning_control/user_config.py +++ b/stable_learning_control/user_config.py @@ -8,8 +8,7 @@ import os.path as osp -# Default neural network backend for each algo -# (Must be either 'tf1' or 'pytorch') +# Default neural network backend for each algo (Must be either 'tf1' or 'pytorch'). DEFAULT_BACKEND = { "lac": "pytorch", "sac": "pytorch", @@ -25,11 +24,11 @@ # Whether GridSearch provides automatically-generated default shorthands: DEFAULT_SHORTHAND = True -# Tells the GridSearch how many seconds to pause for before launching +# Tells the GridSearch how many seconds to pause for before launching. # experiments. WAIT_BEFORE_LAUNCH = 5 -# Logger std out output type +# Logger std out output type. # NOTE:The format in which the statistics are displayed to the terminal. Options are # "table" which supplies them as a table and "line" which prints them in one line DEFAULT_STD_OUT_TYPE = "line" diff --git a/stable_learning_control/utils/log_utils/helpers.py b/stable_learning_control/utils/log_utils/helpers.py index 72fed0323..42b4f830b 100644 --- a/stable_learning_control/utils/log_utils/helpers.py +++ b/stable_learning_control/utils/log_utils/helpers.py @@ -57,7 +57,7 @@ def colorize(string, color, bold=False, highlight=False): Returns: str: Colorized string. """ - if color: # If not empty + if color: # If not empty. return gym_colorize(string, color, bold, highlight) else: return string @@ -172,10 +172,10 @@ def setup_logger_kwargs( A dict containing output_dir and exp_name. """ - # Datestamp forcing + # Datestamp forcing. datestamp = datestamp or FORCE_DATESTAMP - # Make base path + # Make base path. ymd_time = time.strftime("%Y-%m-%d_") if datestamp else "" relpath = "".join([ymd_time, exp_name]) diff --git a/stable_learning_control/utils/log_utils/logx.py b/stable_learning_control/utils/log_utils/logx.py index 44e754eea..6a6d6bf09 100644 --- a/stable_learning_control/utils/log_utils/logx.py +++ b/stable_learning_control/utils/log_utils/logx.py @@ -89,7 +89,7 @@ def __init__( exp_name (str): Experiment name. """ if proc_id() == 0: - # Parse output_fname to see if csv was requested + # Parse output_fname to see if csv was requested. extension = osp.splitext(output_fname)[1] self._output_csv = True if extension.lower() == ".csv" else False @@ -130,10 +130,10 @@ def __init__( self.tb_writer = None self._tabular_to_tb_dict = ( dict() - ) # Stores whether tabular is logged to tensorboard when dump_tabular is called + ) # Stores if tabular is logged to tensorboard when dump_tabular is called. self._step_count_dict = ( dict() - ) # Used for keeping count of the current global step + ) # Used for keeping count of the current global step. def log( self, @@ -185,7 +185,7 @@ def log_to_tb(self, key, val, tb_prefix=None, tb_alias=None, global_step=None): global_step (int, optional): Global step value to record. Uses internal step counter if global step is not supplied. 
""" - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. var_name = tb_alias if tb_alias is not None else key var_name = tb_prefix + "/" + var_name if tb_prefix is not None else var_name self._write_to_tb(var_name, val, global_step=global_step) @@ -258,7 +258,7 @@ def dump_tabular(self, global_step=None): # noqa: C901 print_keys = [] print_vals = [] - # Retrieve data from current row + # Retrieve data from current row. for key in self._log_headers: val = self._log_current_row.get(key, "") valstr = ( @@ -271,12 +271,12 @@ def dump_tabular(self, global_step=None): # noqa: C901 print_dict[key] = valstr vals.append(val) - # Log to stdout + # Log to stdout. if self.verbose: if self.verbose_vars: key_filter = self.verbose_vars - # Make sure Epcoh and EnvInteract are always shown if present + # Make sure Epcoh and EnvInteract are always shown if present. for item in reversed(["Epoch", "TotalEnvInteracts"]): if item not in key_filter and item in print_keys: key_filter.insert(0, item) @@ -313,7 +313,7 @@ def dump_tabular(self, global_step=None): # noqa: C901 ] ) self.log(print_str) - else: # Increase epoch steps and time on the same line + else: # Increase epoch steps and time on the same line. self.log( "\r{}: {:8.3G}, {}: {:8.3g}, {}: {:8.3G} s".format( "Epoch", @@ -326,18 +326,18 @@ def dump_tabular(self, global_step=None): # noqa: C901 end="", ) - # Log to file + # Log to file. if self.output_file is not None: if self._first_row: self.output_file.write("\t".join(self._log_headers) + "\n") self.output_file.write("\t".join(map(str, vals)) + "\n") self.output_file.flush() - # Write tabular to tensorboard log + # Write tabular to tensorboard log. for key in self._log_headers: if self._tabular_to_tb_dict[key]["tb_write"]: val = self._log_current_row.get(key, "") - # Use internal counter if global_step is None + # Use internal counter if global_step is None. if global_step is None: if key in self._log_headers: global_step = self._global_step @@ -457,7 +457,7 @@ def load_env(cls, env_path): load_path = load_path[0] else: load_path = env_path - # try to load environment from save + # try to load environment from save. # NOTE: Sometimes this will fail because the environment could not be pickled. try: state = joblib.load(load_path) @@ -530,7 +530,7 @@ def save_state(self, state_dict, itr=None): except (ValueError, pickle.PicklingError): self.log("Warning: could not pickle state_dict.", color="red") - # Save model state + # Save model state. if hasattr(self, "tf_saver_elements"): backend_folder_name = "tf2_save" self._tf_save(itr) @@ -538,7 +538,7 @@ def save_state(self, state_dict, itr=None): backend_folder_name = "torch_save" self._pytorch_save(itr) - # Save checkpoint state + # Save checkpoint state. if self._save_checkpoints and itr is not None: itr_name = ( "iter%d" % itr @@ -563,7 +563,7 @@ def setup_tf_saver(self, what_to_save): Tensorflow models. """ global tf - tf = import_tf() # Import tf if installed otherwise throw warning + tf = import_tf() # Import tf if installed otherwise throw warning. self.tf_saver_elements = what_to_save self.log("Policy will be saved to '{}'.\n".format(self.output_dir), type="info") @@ -596,12 +596,12 @@ def _tf_save(self, itr=None): self, "tf_saver_elements" ), "First have to setup saving with self.setup_tf_saver" - # Create filename + # Create filename. 
fpath = osp.join(self.output_dir, "tf2_save") fname = osp.join(fpath, "weights_checkpoint") os.makedirs(fpath, exist_ok=True) - # Create Checkpoints name + # Create Checkpoints name. if self._save_checkpoints and itr is not None: itr_name = ( "iter%d" % itr @@ -612,7 +612,7 @@ def _tf_save(self, itr=None): cname = osp.join(cpath, "weights_checkpoint") os.makedirs(cpath, exist_ok=True) - # Save additional algorithm information + # Save additional algorithm information. if not self._save_info_saved: save_info = { "alg_name": self.tf_saver_elements.__class__.__name__, @@ -626,7 +626,7 @@ def _tf_save(self, itr=None): ) self._save_info_saved = True - # Save model + # Save model. if isinstance(self.tf_saver_elements, tf.keras.Model) or hasattr( self.tf_saver_elements, "save_weights" ): @@ -634,7 +634,7 @@ def _tf_save(self, itr=None): else: self.log(save_fail_warning, type="warning") - # Save checkpoint + # Save checkpoint. if self._save_checkpoints and itr is not None: if isinstance(self.tf_saver_elements, tf.keras.Model) or hasattr( self.tf_saver_elements, "save_weights" ): @@ -643,7 +643,7 @@ def _tf_save(self, itr=None): else: self.log(save_fail_warning, type="warning") - self._checkpoint += 1 # Increase epoch + self._checkpoint += 1 # Increase epoch. def _pytorch_save(self, itr=None): """Saves the PyTorch model/models using their ``state_dict``. @@ -664,12 +664,12 @@ def _pytorch_save(self, itr=None): self, "pytorch_saver_elements" ), "First have to setup saving with self.setup_pytorch_saver" - # Create filename + # Create filename. fpath = osp.join(self.output_dir, "torch_save") fname = osp.join(fpath, "model_state.pt") os.makedirs(fpath, exist_ok=True) - # Create Checkpoints Name + # Create Checkpoints Name. if self._save_checkpoints and itr is not None: itr_name = ( "iter%d" % itr @@ -680,7 +680,7 @@ def _pytorch_save(self, itr=None): cname = osp.join(cpath, "model_state.pt") os.makedirs(cpath, exist_ok=True) - # Save additional algorithm information + # Save additional algorithm information. if not self._save_info_saved: save_info = { "alg_name": self.pytorch_saver_elements.__class__.__name__, @@ -696,7 +696,7 @@ def _pytorch_save(self, itr=None): ) self._save_info_saved = True - # Save model + # Save model. if isinstance(self.pytorch_saver_elements, torch.nn.Module) or hasattr( self.pytorch_saver_elements, "state_dict" ): @@ -704,7 +704,7 @@ def _pytorch_save(self, itr=None): else: self.log(save_fail_warning, type="warning") - # Save checkpoint + # Save checkpoint. if self._save_checkpoints: if isinstance(self.pytorch_saver_elements, torch.nn.Module) or hasattr( self.pytorch_saver_elements, "state_dict" ): @@ -713,7 +713,7 @@ def _pytorch_save(self, itr=None): else: self.log(save_fail_warning, type="warning") - self._checkpoint += 1 # Increase epoch + self._checkpoint += 1 # Increase epoch. def _write_to_tb(self, var_name, data, global_step=None): """Writes data to tensorboard log file. @@ -729,13 +729,13 @@ def _write_to_tb(self, var_name, data, global_step=None): counter if global step is not supplied. """ - # Try to write data to tb as as historgram + # Try to write data to tb as a histogram. if not self.tb_writer: self.use_tensorboard = ( - True # Property that creates tf writer if set to True + True # Property that creates tf writer if set to True. ) - if is_scalar(data): # Extra protection since trying to write a list freezes tb - try: # Try to write as scalar + if is_scalar(data): # Extra protection since trying to write a list freezes tb. + try: # Try to write as scalar.
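The `_pytorch_save` hunks above boil down to the standard PyTorch `state_dict` save/restore pattern, which is also why the method checks for a `state_dict` attribute before attempting the save. A minimal standalone sketch of that pattern (the `nn.Linear` module and the bare file name are made-up stand-ins, not the logger's actual `torch_save/model_state.pt` layout):

import torch
import torch.nn as nn

# Hypothetical stand-in for whatever object is registered with the PyTorch saver.
model = nn.Linear(4, 2)

# Persist only the parameters (what a state_dict holds), not the whole object.
torch.save(model.state_dict(), "model_state.pt")

# Restoring requires rebuilding the same architecture first, then loading weights.
restored = nn.Linear(4, 2)
restored.load_state_dict(torch.load("model_state.pt"))
restored.eval()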
self.add_scalar(var_name, data, global_step=global_step) except ( AssertionError, @@ -746,7 +746,7 @@ def _write_to_tb(self, var_name, data, global_step=None): ): pass else: - # Try to write data to tb as as historgram + # Try to write data to tb as a histogram. try: self.add_histogram(var_name, data, global_step=global_step) except ( @@ -758,7 +758,7 @@ def _write_to_tb(self, var_name, data, global_step=None): ): pass - # Try to write data as image + # Try to write data as image. try: self.add_image(var_name, data, global_step=global_step) except ( @@ -801,11 +801,11 @@ def use_tensorboard(self, value): """ self._use_tensorboard = value - # Create tensorboard writer if use_tensorboard == True else delete - if self._use_tensorboard and not self.tb_writer: # Create writer object + # Create tensorboard writer if use_tensorboard == True else delete. + if self._use_tensorboard and not self.tb_writer: # Create writer object. if self._use_tf_backend: self.log("Using Tensorflow as the Tensorboard backend.", type="info") - tf = import_tf() # Import tf if installed otherwise throw warning + tf = import_tf() # Import tf if installed otherwise throw warning. self.tb_writer = tf.summary.create_file_writer(self.output_dir) else: self.log( @@ -818,10 +818,10 @@ def use_tensorboard(self, value): comment=f"{exp_name.upper()}-data_" + time.strftime("%Y%m%d-%H%M%S"), ) - atexit.register(self.tb_writer.close) # Make sure the writer is closed - elif not self._use_tensorboard and self.tb_writer: # Delete tensorboard writer - self.tb_writer.close() # Close writer - atexit.unregister(self.tb_writer.close) # Make sure the writer is closed + atexit.register(self.tb_writer.close) # Make sure the writer is closed. + elif not self._use_tensorboard and self.tb_writer: # Delete tensorboard writer. + self.tb_writer.close() # Close writer. + atexit.unregister(self.tb_writer.close) # Make sure the writer is closed. self.tb_writer = None @property @@ -849,7 +849,7 @@ def add_hparams(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_hparams' method is not available when using the 'tensorflow' " @@ -869,7 +869,7 @@ def add_scalar(self, *args, **kwargs): *args: All args to pass to the Summary/SummaryWriter object. **kwargs: All kwargs to pass to the Summary/SummaryWriter object. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: kwargs["step"] = kwargs.pop("global_step") global tf @@ -903,7 +903,7 @@ def add_scalars(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_scalars' method is not available when using the 'tensorflow' " @@ -924,7 +924,7 @@ def add_histogram(self, *args, **kwargs): *args: All args to pass to the Summary/SummaryWriter object. **kwargs: All kwargs to pass to the Summary/SummaryWriter object. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists.
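For context on the `add_scalar`/`add_histogram` wrappers touched above: they forward to a TensorBoard summary writer, which for the non-TensorFlow backend is typically `torch.utils.tensorboard.SummaryWriter`. A rough standalone sketch of those underlying calls (the log directory and tags here are made up for illustration, not the logger's defaults):

import numpy as np
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/demo")  # Hypothetical log directory.
for step in range(10):
    # Scalars show up as line plots in TensorBoard, keyed by tag and global_step.
    writer.add_scalar("loss", 1.0 / (step + 1), global_step=step)
# Array-like data is summarised as a histogram per step.
writer.add_histogram("weights", np.random.randn(100), global_step=9)
writer.close()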
if self._use_tf_backend: kwargs["step"] = kwargs.pop("global_step") global tf @@ -959,7 +959,7 @@ def add_histogram_raw(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_histogram_raw' method is not available when using the " @@ -979,7 +979,7 @@ def add_image(self, *args, **kwargs): *args: All args to pass to the Summary/SummaryWriter object. **kwargs: All kwargs to pass to the Summary/SummaryWriter object. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: kwargs["step"] = kwargs.pop("global_step") global tf @@ -1013,7 +1013,7 @@ def add_images(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_images' method is not available when using the 'tensorflow' " @@ -1037,7 +1037,7 @@ def add_image_with_boxes(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_image_with_boxes' method is not available when using the " @@ -1061,7 +1061,7 @@ def add_figure(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_figure' method is not available when using the 'tensorflow' " @@ -1084,7 +1084,7 @@ def add_video(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_video' method is not available when using the 'tensorflow' " @@ -1104,7 +1104,7 @@ def add_audio(self, *args, **kwargs): *args: All args to pass to the Summary/SummaryWriter object. **kwargs: All kwargs to pass to the Summary/SummaryWriter object. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: kwargs["step"] = kwargs.pop("global_step") global tf @@ -1136,7 +1136,7 @@ def add_text(self, *args, **kwargs): *args: All args to pass to the Summary/SummaryWriter object. **kwargs: All kwargs to pass to the Summary/SummaryWriter object. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: kwargs["step"] = kwargs.pop("global_step") global tf @@ -1171,7 +1171,7 @@ def add_onnx_graph(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. 
""" - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_onnx_graph' method is not available when using the " @@ -1194,7 +1194,7 @@ def add_graph(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_graph' method is not available when using the 'tensorflow' " @@ -1218,7 +1218,7 @@ def add_embedding(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_embedding' method is not available when using the " @@ -1241,7 +1241,7 @@ def add_pr_curve(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_pr_curve' method is not available when using the " @@ -1264,7 +1264,7 @@ def add_pr_curve_raw(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_pr_curve_raw' method is not available when using the " @@ -1290,7 +1290,7 @@ def add_custom_scalars_multilinechart(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_custom_scalars_multilinechart' method is not available " @@ -1316,7 +1316,7 @@ def add_custom_scalars_marginchart(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_custom_scalars_marginchart' method is not available when " @@ -1346,7 +1346,7 @@ def add_custom_scalars(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: raise NotImplementedError( "The 'add_custom_scalars' method is not available when using the " @@ -1370,7 +1370,7 @@ def add_mesh(self, *args, **kwargs): NotImplementedError: Raised if you try to call this method when using the Tensorflow backend. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. 
if self._use_tf_backend: raise NotImplementedError( "The 'add_mesh' method is not available when using the 'tensorflow' " @@ -1392,7 +1392,7 @@ def flush(self, *args, **kwargs): *args: All args to pass to the Summary/SummaryWriter object. **kwargs: All kwargs to pass to the Summary/SummaryWriter object. """ - self.use_tensorboard = True # Make sure SummaryWriter exists + self.use_tensorboard = True # Make sure SummaryWriter exists. if self._use_tf_backend: global tf with self.tb_writer.as_default(): @@ -1583,7 +1583,7 @@ def store( counter if global step is not supplied. """ for k, v in kwargs.items(): - # Store variable values in epoch_dict and increase global step count + # Store variable values in epoch_dict and increase global step count. if not (k in self.epoch_dict.keys()): self.epoch_dict[k] = [] self._step_count_dict[k] = 0 @@ -1592,19 +1592,19 @@ def store( else: self.epoch_dict[k].append(v) - # Increase the step count for all the keys + # Increase the step count for all the keys. # NOTE: This is done in such a way that two values of a given key do not - # get the same global step value assigned to them + # get the same global step value assigned to them. self._step_count_dict[k] = ( self._step_count_dict[k] + 1 if self._step_count_dict[k] + 1 >= self._global_step else self._global_step ) - # Check if a alias was given for the current parameter + # Check if an alias was given for the current parameter. var_name = k if k not in tb_aliases.keys() else tb_aliases[k] - # Write variable value to tensorboard + # Write variable value to tensorboard. tb_write_key = ( (tb_write[k] if k in tb_write.keys() else False) if isinstance(tb_write, dict) @@ -1613,7 +1613,7 @@ def store( if tb_write_key: global_step = ( global_step if global_step is not None else self._global_step - ) # Use internal counter if global_step is None + ) # Use internal counter if global_step is None. self._write_to_tb(var_name, v, global_step=global_step) def log_to_tb( @@ -1646,7 +1646,7 @@ def log_to_tb( global_step (int, optional): Global step value to record. Uses internal step counter if global step is not supplied. """ - if val is not None: # When key and value are supplied use direct write + if val is not None: # When key and value are supplied use direct write. super().log_to_tb( keys, val, @@ -1654,10 +1654,10 @@ def log_to_tb( tb_alias=tb_alias, global_step=global_step, ) - else: # When only keys are supplied use internal storage + else: # When only keys are supplied use internal storage. keys = [keys] if not isinstance(keys, list) else keys for key in keys: - if global_step is None: # Retrieve global step if not supplied + if global_step is None: # Retrieve global step if not supplied. if self._n_table_dumps >= 1: global_step_tmp = self._global_step elif key in self.epoch_dict.keys(): @@ -1767,7 +1767,7 @@ def dump_tabular(self, *args, **kwargs): self._n_table_dumps += 1 self._tb_index_dict = { key: 0 for key in self._tb_index_dict.keys() - } # Reset tensorboard logging index storage dict + } # Reset tensorboard logging index storage dict. def get_stats(self, key): """Lets an algorithm ask the logger for mean/std/min/max of a diagnostic.
diff --git a/stable_learning_control/utils/mpi_utils/mpi_pytorch.py b/stable_learning_control/utils/mpi_utils/mpi_pytorch.py index 1f4a6b985..8e9fa89ed 100644 --- a/stable_learning_control/utils/mpi_utils/mpi_pytorch.py +++ b/stable_learning_control/utils/mpi_utils/mpi_pytorch.py @@ -39,15 +39,15 @@ def mpi_avg_grads(module): if num_procs() == 1: return - # Sync torch module parameters + # Sync torch module parameters. if hasattr(module, "parameters"): for p in module.parameters(): - # Sync network grads + # Sync network grads. p_grad_numpy = p.grad.numpy() avg_p_grad = mpi_avg(p.grad) p_grad_numpy[:] = avg_p_grad[:] elif isinstance(module, torch.Tensor): - # Sync network grads + # Sync network grads. p_grad_numpy = module.grad.numpy() avg_p_grad = mpi_avg(module.grad) if isinstance(avg_p_grad, list): @@ -72,19 +72,19 @@ def sync_params(module): if num_procs() == 1: return - # Sync torch module parameters + # Sync torch module parameters. if hasattr(module, "parameters"): - # Sync network parameters + # Sync network parameters. for p in module.parameters(): p_numpy = p.data.numpy() broadcast(p_numpy) elif isinstance(module, torch.Tensor): - # Sync pytorch parameter + # Sync pytorch parameter. p_numpy = module.data.numpy() broadcast(p_numpy) return elif isinstance(module, np.ndarray): - # Sync numpy parameters + # Sync numpy parameters. broadcast(module) else: raise TypeError( diff --git a/stable_learning_control/utils/mpi_utils/mpi_tools.py b/stable_learning_control/utils/mpi_utils/mpi_tools.py index 867febc92..c10d7eddd 100644 --- a/stable_learning_control/utils/mpi_utils/mpi_tools.py +++ b/stable_learning_control/utils/mpi_utils/mpi_tools.py @@ -154,7 +154,7 @@ def mpi_statistics_scalar(x, with_min_and_max=False): mean = global_sum / global_n global_sum_sq = mpi_sum(np.sum((x - mean) ** 2)) - std = np.sqrt(global_sum_sq / global_n) # compute global std + std = np.sqrt(global_sum_sq / global_n) # compute global std. if with_min_and_max: global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN)
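The `mpi_avg_grads`/`sync_params`/`mpi_statistics_scalar` hunks above all rest on the same collective pattern: sum a buffer element-wise over all ranks, then divide by the number of processes. A minimal standalone `mpi4py` sketch of that averaging step (the array is a made-up stand-in for a gradient buffer; run with something like `mpirun -n 4 python demo.py`):

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
local = np.random.randn(3)                 # Stand-in for this rank's gradient buffer.
summed = np.empty_like(local)
comm.Allreduce(local, summed, op=MPI.SUM)  # Element-wise sum across every rank.
averaged = summed / comm.Get_size()        # Average = sum / number of processes.
print(f"rank {comm.Get_rank()}: {averaged}")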