diff --git a/.github/labels.yml b/.github/labels.yml
new file mode 100644
index 0000000..f7f83aa
--- /dev/null
+++ b/.github/labels.yml
@@ -0,0 +1,66 @@
+---
+# Label names are important, as Release Drafter uses them to decide where to
+# record each change in the changelog, or whether to skip it.
+#
+# The repository labels will be automatically configured using this file and
+# the GitHub Action https://github.com/marketplace/actions/github-labeler.
+- name: breaking
+  description: Breaking Changes
+  color: bfd4f2
+- name: bug
+  description: Something isn't working
+  color: d73a4a
+- name: build
+  description: Build System and Dependencies
+  color: bfdadc
+- name: ci
+  description: Continuous Integration
+  color: 4a97d6
+- name: dependencies
+  description: Pull requests that update a dependency file
+  color: 0366d6
+- name: documentation
+  description: Improvements or additions to documentation
+  color: 0075ca
+- name: duplicate
+  description: This issue or pull request already exists
+  color: cfd3d7
+- name: enhancement
+  description: New feature or request
+  color: a2eeef
+- name: github_actions
+  description: Pull requests that update GitHub Actions code
+  color: "000000"
+- name: good first issue
+  description: Good for newcomers
+  color: 7057ff
+- name: help wanted
+  description: Extra attention is needed
+  color: 008672
+- name: invalid
+  description: This doesn't seem right
+  color: e4e669
+- name: performance
+  description: Performance
+  color: "016175"
+- name: python
+  description: Pull requests that update Python code
+  color: 2b67c6
+- name: question
+  description: Further information is requested
+  color: d876e3
+- name: refactoring
+  description: Refactoring
+  color: ef67c4
+- name: removal
+  description: Removals and Deprecations
+  color: 9ae7ea
+- name: style
+  description: Style
+  color: c120e5
+- name: testing
+  description: Testing
+  color: b1fc6f
+- name: wontfix
+  description: This will not be worked on
+  color: ffffff
diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
new file mode 100644
index 0000000..a9943a6
--- /dev/null
+++ b/.github/release-drafter.yml
@@ -0,0 +1,28 @@
+categories:
+  - title: ":boom: Breaking Changes"
+    label: "breaking"
+  - title: ":rocket: Features"
+    label: "enhancement"
+  - title: ":fire: Removals and Deprecations"
+    label: "removal"
+  - title: ":beetle: Fixes"
+    label: "bug"
+  - title: ":racehorse: Performance"
+    label: "performance"
+  - title: ":rotating_light: Testing"
+    label: "testing"
+  - title: ":construction_worker: Continuous Integration"
+    label: "ci"
+  - title: ":books: Documentation"
+    label: "documentation"
+  - title: ":hammer: Refactoring"
+    label: "refactoring"
+  - title: ":lipstick: Style"
+    label: "style"
+  - title: ":package: Dependencies"
+    labels:
+      - "dependencies"
+      - "build"
+template: |
+  ## Changes
+  $CHANGES
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
new file mode 100644
index 0000000..38e2023
--- /dev/null
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,18 @@
+name: Labeler
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  labeler:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v3
+
+      - name: Run Labeler
+        uses: crazy-max/ghaction-github-labeler@v4.1.0
+        with:
+          skip-delete: true
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..95afd55
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,78 @@
+name: Release
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  release:
+    name: Release
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Upgrade pip
+        run: |
+          pip install --constraint=.github/workflows/constraints.txt pip
+          pip --version
+
+      - name: Install Poetry
+        run: |
+          pip install --constraint=.github/workflows/constraints.txt poetry
+          poetry --version
+
+      - name: Check if there is a parent commit
+        id: check-parent-commit
+        run: |
+          echo "sha=$(git rev-parse --verify --quiet HEAD^)" >> "$GITHUB_OUTPUT"
+
+      - name: Detect and tag new version
+        id: check-version
+        if: steps.check-parent-commit.outputs.sha
+        uses: salsify/action-detect-and-tag-new-version@v2.0.3
+        with:
+          version-command: |
+            bash -o pipefail -c "poetry version | awk '{ print \$2 }'"
+
+      - name: Bump version for developmental release
+        if: "! steps.check-version.outputs.tag"
+        run: |
+          poetry version patch &&
+          version=$(poetry version | awk '{ print $2 }') &&
+          poetry version $version.dev.$(date +%s)
+
+      - name: Build package
+        run: |
+          poetry build --ansi
+
+      # - name: Publish package on PyPI
+      #   if: steps.check-version.outputs.tag
+      #   uses: pypa/gh-action-pypi-publish@v1.8.8
+      #   with:
+      #     user: __token__
+      #     password: ${{ secrets.PYPI_TOKEN }}
+
+      # - name: Publish package on TestPyPI
+      #   if: "! steps.check-version.outputs.tag"
+      #   uses: pypa/gh-action-pypi-publish@v1.8.8
+      #   with:
+      #     user: __token__
+      #     password: ${{ secrets.TEST_PYPI_TOKEN }}
+      #     repository_url: https://test.pypi.org/legacy/
+
+      - name: Publish the release notes
+        uses: release-drafter/release-drafter@v5.24.0
+        with:
+          publish: ${{ steps.check-version.outputs.tag != '' }}
+          tag: ${{ steps.check-version.outputs.tag }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
index b6e4761..ff01cc8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# My additions
+tests/data
diff --git a/src/cesm_helper_scripts/gen_agg b/src/cesm_helper_scripts/gen_agg
index 7ea426b..421a6b8 100755
--- a/src/cesm_helper_scripts/gen_agg
+++ b/src/cesm_helper_scripts/gen_agg
@@ -55,8 +55,20 @@ parser.add_argument(
     default=["TREFHT"],
     help="List of attributes of netCDF file",
 )
+parser.add_argument(
+    "--append-to",
+    type=str,
+    default="",
+    help="An existing output file to which the input data files should be appended.",
+)
 args = parser.parse_args()
+if args.append_to != "":
+    sys.exit(
+        "Sorry, but an appending method will not be implemented in the near future."
+        " Instead, you can use the ncrcat function from the NCO family"
+        " (http://nco.sf.net/nco.html#ncrcat)."
+    )
 
 # Correct the input argument
 if args.input is None:
     raise ValueError("you must give the input files")
@@ -74,9 +86,11 @@ else:
     path = ""
 # Combine the path with all files
 inputs = [
-    f"{path}{file}"
-    if file.split(".")[-1] == "nc" or "*" in file
-    else f"{path}{file}.nc"
+    (
+        f"{path}{file}"
+        if file.split(".")[-1] == "nc" or "*" in file
+        else f"{path}{file}.nc"
+    )
     for file in args.input
 ]
 # If an asterisk (*) is used, all other files are discarded
@@ -103,7 +117,7 @@ def _attr_present(attr) -> bool:
         check_input = glob.glob(the_input)[0]
     elif isinstance(the_input, list):
         check_input = the_input[0]
-    ds = xr.open_mfdataset(check_input)
+    ds = xr.open_mfdataset(check_input, lock=False)
     try:
         _ = getattr(ds, attr)
     except AttributeError as e:
@@ -111,6 +125,8 @@ def _attr_present(attr) -> bool:
         return False
     else:
         return True
+    finally:
+        ds.close()
 
 
 # Correct the savepath argument
@@ -135,11 +151,10 @@ if not attrs:
     sys.exit("All attributes files already exist. Exiting...")
 
 print("Creating aggregated dataset... ", end="", flush=True)
-dataset = xr.open_mfdataset(the_input)
+# See issue https://github.com/pydata/xarray/issues/3961
+dataset = xr.open_mfdataset(the_input, lock=False)
 dataset = xr.decode_cf(dataset)
 print("Finished creating aggregated dataset.")
-# TREFHT: This is probably the correct one for global temperature, reference height
-# temperature. SNOWHLND: land snow volume?
 for i, a in enumerate(attrs):
     print(
         f"{i+1}/{len(attrs)}: Start creating file for attr {a}... ", end="", flush=True
@@ -165,14 +180,22 @@ for i, a in enumerate(attrs):
                 parts += 1
                 continue
             bulk = ds[(parts - 1) * ten_years : parts * ten_years]
-            bulk = bulk.assign_attrs(
-                {"Time span": f"From {bulk.time.data[0]} to {bulk.time.data[-1]}"}
+            bulk = bulk.to_dataset()
+            bulk.attrs[
+                "history"
+            ] = f"Time span: From {bulk.time.data[0]} to {bulk.time.data[-1]}"
+            bulk.to_netcdf(
+                savepath + a + output[:-3] + f"-{parts}.nc", unlimited_dims="time"
             )
-            bulk.to_netcdf(savepath + a + output[:-3] + f"-{parts}.nc")
+            bulk.close()
             parts += 1
     else:
-        ds = ds.assign_attrs(
-            {"Time span": f"From {ds.time.data[0]} to {ds.time.data[-1]}"}
-        )
-        ds.to_netcdf(savepath + a + output)
+        ds = ds.to_dataset()
+        ds.attrs[
+            "history"
+        ] = f"Time span: From {ds.time.data[0]} to {ds.time.data[-1]}"
+        ds.to_netcdf(savepath + a + output, unlimited_dims="time")
         print(f"{tabs}Finished creating {a + output}.")
+    finally:
+        ds.close()
+dataset.close()
diff --git a/tests/create_data.py b/tests/create_data.py
new file mode 100644
index 0000000..ed2e289
--- /dev/null
+++ b/tests/create_data.py
@@ -0,0 +1,230 @@
+"""Script that generates netCDF files used to test the modules."""
+
+import os
+import shutil
+from typing import Literal, Optional
+
+import netCDF4
+import numpy as np
+
+FileFormat = Literal[
+    "NETCDF3_CLASSIC",
+    "NETCDF4",
+    "NETCDF4_CLASSIC",
+    "NETCDF3_64BIT_OFFSET",
+    "NETCDF3_64BIT_DATA",
+]
+
+
+class Dataset:
+    def __init__(self) -> None:
+        if (
+            here := os.path.basename(abs_path := os.path.abspath("."))
+        ) == "cesm-helper-scripts":
+            self.path = os.path.join("tests", "data")
+        elif here == "tests":
+            self.path = "data"
+        else:
+            raise OSError(
+                "You seem to be in the wrong directory. I tried to find ./tests/data,"
+                f" but could only find absolute path {abs_path}, and that ends in the"
+                f" {here} directory."
+            )
+        self.clean()
+        self.file_format_list: list[FileFormat] = [
+            "NETCDF3_CLASSIC",
+            "NETCDF4",
+            "NETCDF4_CLASSIC",
+            "NETCDF3_64BIT_OFFSET",
+            "NETCDF3_64BIT_DATA",
+        ]
+        self.format: FileFormat = "NETCDF4_CLASSIC"
+        self.num_files = 10
+
+    def set_variables(self) -> None:
+        self.variables: dict[str, dict] = {
+            "T": {
+                "dims": ("time", "lev", "lat", "lon"),
+                "type": "f4",
+                "units": "K",
+                "mdims": 1,
+                "long_name": "Temperature",
+                "cell_methods": "time: mean",
+            },
+            "TREFHT": {
+                "dims": ("time", "lat", "lon"),
+                "type": "f4",
+                "units": "K",
+                "long_name": "Reference height temperature",
+                "cell_methods": "time: mean",
+            },
+            "FLNT": {
+                "dims": ("time", "lat", "lon"),
+                "type": "f4",
+                "Sampling_Sequence": "rad_lwsw",
+                "units": "W/m2",
+                "long_name": "Net longwave flux at top of model",
+                "cell_methods": "time: mean",
+            },
+            "FSNT": {
+                "dims": ("time", "lat", "lon"),
+                "type": "f4",
+                "Sampling_Sequence": "rad_lwsw",
+                "units": "W/m2",
+                "long_name": "Net solar flux at top of model",
+                "cell_methods": "time: mean",
+            },
+            "AODVISstdn": {
+                "dims": ("time", "lat", "lon"),
+                "type": "f4",
+                "_FillValue": 1e36,
+                # "missing_value": 1e36,
+                "long_name": "Stratospheric aerosol optical depth 550 nm, day night",
+                "cell_methods": "time: mean",
+            },
+        }
+
+    def clean(self) -> None:
+        """Clean up the data directory for generated data sets."""
+        if os.path.exists(self.path):
+            shutil.rmtree(self.path)
+        os.makedirs(self.path)
+
+    def make_datasets(self, format: Optional[FileFormat] = None) -> None:
+        self.format = format or self.format
+        for i in range(self.num_files):
+            i_ = "0" * (2 - len(str(i))) + str(i)
+            file_name = f"simulation.cam.h0.1852-{i_}.nc"
+            self.set_variables()
+            self.create_dataset(file_name, i * 30, self.format)
+
+    def create_dataset(
+        self, file_name: str, time_stamp: int, format: FileFormat
+    ) -> None:
+        ds = netCDF4.Dataset(
+            os.path.join(self.path, file_name), mode="w", format=format
+        )
+        ds.description = "Example description"
+        ds.creator = (
+            "Example creator. Here, we make the line so long that it has to wrap."
+            " Notice that the line in this case will start on the next line, as opposed"
+            " to the 'description' variable above. Newlines inside the description are"
+            " indented with four spaces, while lines that have been wrapped are"
+            " indented with eight spaces. This is also printed with a dim colour where"
+            " the variable name is. Can you see it?\n It is not so easy to see, but"
+            " that is also the point, since it does not really provide any useful"
+            " information; you only need to know about it and then it should be"
+            " unobtrusive otherwise. At this point I don't have anything more to say, I"
+            " am just making sure the line is long enough to get some wrapping."
+        )
+        ds.title = ""
+        ds.time_period_freq = "month_1"
+        ds.createDimension("lat", 96)  # latitude axis
+        ds.createDimension("lon", 144)  # longitude axis
+        ds.createDimension("lev", 70)  # level axis
+        ds.createDimension("ilev", 71)  # interface levels
+        ds.createDimension("chars", 8)  # KeyError: No variable information
+        ds.createDimension("nbnd", 2)  # KeyError: No variable information
+        ds.createDimension("time")  # unlimited axis (can be appended to)
+        lat = ds.createVariable("lat", "f8", ("lat",), fill_value=-900)
+        lat.units = "degrees_north"
+        lat.long_name = "latitude"
+        lon = ds.createVariable("lon", float, ("lon",), fill_value=-900)
+        lon.units = "degrees_east"
+        lon.long_name = "longitude"
+        lev = ds.createVariable("lev", float, ("lev",))
+        lev.units = "hPa"
+        lev.positive = "down"
+        lev.long_name = "hybrid level at midpoints (1000*(A+B))"
+        lev.standard_name = "atmosphere_hybrid_sigma_pressure_coordinate"
+        lev.formula_terms = "a: hyam b: hybm p0: P0 ps: PS"
+        ilev = ds.createVariable("ilev", float, ("ilev",))
+        ilev.units = "hPa"
+        ilev.positive = "down"
+        ilev.long_name = "hybrid level at interfaces (1000*(A+B))"
+        ilev.standard_name = "atmosphere_hybrid_sigma_pressure_coordinate"
+        ilev.formula_terms = "a: hyai b: hybi p0: P0 ps: PS"
+        time = ds.createVariable("time", float, ("time",))
+        time.units = "days since 1850-01-01 00:00:00"
+        time.long_name = "time"
+        time.calendar = "noleap"
+        time.bounds = "time_bnds"
+
+        nlats = ds.dimensions["lat"].size
+        nlons = ds.dimensions["lon"].size
+        nlevs = ds.dimensions["lev"].size
+        # Populate variables with data
+        ds.variables["lat"]
+        ds.variables["lon"]
+        time_ = ds.variables["time"]
+        time_[:] = time_stamp  # Days since 1850
+        lat[:] = -90.0 + (180.0 / nlats) * np.arange(nlats)
+        lon[:] = (360.0 / nlons) * np.arange(nlons)  # Greenwich meridian eastward
+        # fmt: off
+        ilev[:] = [
+            4.500500e-06, 7.420100e-06, 1.223370e-05, 2.017000e-05, 3.325450e-05,
+            5.482750e-05, 9.039800e-05, 1.490400e-04, 2.457200e-04, 4.051250e-04,
+            6.679400e-04, 1.101265e-03, 1.815650e-03, 2.993500e-03, 4.963000e-03,
+            8.150651e-03, 1.347700e-02, 2.231900e-02, 3.679650e-02, 6.066500e-02,
+            9.915650e-02, 1.573900e-01, 2.388500e-01, 3.452000e-01, 4.751350e-01,
+            6.318050e-01, 8.291550e-01, 1.082740e+00, 1.406850e+00, 1.818850e+00,
+            2.339800e+00, 2.995050e+00, 3.814700e+00, 4.834450e+00, 6.096350e+00,
+            7.649350e+00, 9.550100e+00, 1.186400e+01, 1.466550e+01, 1.803800e+01,
+            2.207550e+01, 2.688250e+01, 3.257350e+01, 3.927300e+01, 4.711450e+01,
+            5.624050e+01, 6.680050e+01, 8.070142e+01, 9.494104e+01, 1.116932e+02,
+            1.314013e+02, 1.545868e+02, 1.818634e+02, 2.139528e+02, 2.517044e+02,
+            2.961172e+02, 3.483666e+02, 4.098352e+02, 4.821499e+02, 5.672244e+02,
+            6.523330e+02, 7.304459e+02, 7.963631e+02, 8.453537e+02, 8.737159e+02,
+            9.003246e+02, 9.249645e+02, 9.474323e+02, 9.675386e+02, 9.851122e+02,
+            1.000000e+03,
+        ]
+        lev[:] = [
+            5.960300e-06, 9.826900e-06, 1.620185e-05, 2.671225e-05, 4.404100e-05,
+            7.261275e-05, 1.197190e-04, 1.973800e-04, 3.254225e-04, 5.365325e-04,
+            8.846025e-04, 1.458457e-03, 2.404575e-03, 3.978250e-03, 6.556826e-03,
+            1.081383e-02, 1.789800e-02, 2.955775e-02, 4.873075e-02, 7.991075e-02,
+            1.282732e-01, 1.981200e-01, 2.920250e-01, 4.101675e-01, 5.534700e-01,
+            7.304800e-01, 9.559475e-01, 1.244795e+00, 1.612850e+00, 2.079325e+00,
+            2.667425e+00, 3.404875e+00, 4.324575e+00, 5.465400e+00, 6.872850e+00,
+            8.599725e+00, 1.070705e+01, 1.326475e+01, 1.635175e+01, 2.005675e+01,
+            2.447900e+01, 2.972800e+01, 3.592325e+01, 4.319375e+01, 5.167750e+01,
+            6.152050e+01, 7.375096e+01, 8.782123e+01, 1.033171e+02, 1.215472e+02,
+            1.429940e+02, 1.682251e+02, 1.979081e+02, 2.328286e+02, 2.739108e+02,
+            3.222419e+02, 3.791009e+02, 4.459926e+02, 5.246872e+02, 6.097787e+02,
+            6.913894e+02, 7.634045e+02, 8.208584e+02, 8.595348e+02, 8.870202e+02,
+            9.126445e+02, 9.361984e+02, 9.574855e+02, 9.763254e+02, 9.925561e+02
+        ]
+        # fmt: on
+        for var_name, var_dict in self.variables.items():
+            if "_FillValue" in var_dict:
+                dims_length = len(dims_ := var_dict.pop("dims"))
+                var = ds.createVariable(
+                    var_name,
+                    var_dict.pop("type"),
+                    dims_,
+                    fill_value=var_dict.pop("_FillValue"),
+                )
+            else:
+                dims_length = len(dims_ := var_dict.pop("dims"))
+                var = ds.createVariable(var_name, var_dict.pop("type"), dims_)
+            for meta in var_dict.items():
+                setattr(var, meta[0], meta[1])
+            data_slice = np.random.uniform(low=280, high=330, size=(nlats, nlons))
+            data_slice = data_slice[np.newaxis, :]
+            if dims_length == 3:
+                var[:, :, :] = np.asarray(data_slice) * (
+                    np.random.randint(0, 2) * 2 - 1
+                )
+            elif dims_length == 4:
+                for levs in range(nlevs):
+                    var[:, levs, :, :] = np.asarray(data_slice) * levs
+        ds.close()
+
+
+def main():
+    creator = Dataset()
+    creator.make_datasets()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/run_gen_agg.py b/tests/run_gen_agg.py
new file mode 100644
index 0000000..26472b8
--- /dev/null
+++ b/tests/run_gen_agg.py
@@ -0,0 +1,78 @@
+"""Run the `gen_agg` script on the test data."""
+
+import os
+import subprocess
+import time
+
+import create_data as cd
+import numpy as np
+
+
+class RunGenAgg:
+    def __init__(self) -> None:
+        here = os.path.basename(abs_path := os.path.abspath("."))
+        if here == "cesm-helper-scripts":
+            self.data_path = os.path.join("tests", "data")
+            script_path = os.path.join("src", "cesm_helper_scripts")
+        elif here == "tests":
+            self.data_path = "data"
+            script_path = os.path.join("..", "src", "cesm_helper_scripts")
+        else:
+            raise OSError(
+                "You seem to be in the wrong directory. I tried to find ./tests/data,"
+                f" but could only find absolute path {abs_path}, and that ends in the"
+                f" {here} directory."
+            )
+        self.script = os.path.join(script_path, "gen_agg")
+
+    def get_file_list(self) -> list[str]:
+        return os.listdir(self.data_path)
+
+    def simulate(self, splits: int = 1) -> None:
+        """Simulate generation of aggregated variables.
+
+        Parameters
+        ----------
+        splits : int
+            Specify the number of files the aggregated data should be split into.
+        """
+        data = self.get_file_list()
+        data.sort()
+        steps = int(np.ceil(len(data) / splits))
+        file_list: list[list[str]] = [
+            data[x : x + steps] for x in range(0, len(data), steps)
+        ]
+        for i, chunk in enumerate(file_list):
+            if return_code := subprocess.call(
+                [
+                    "python",
+                    self.script,
+                    "-a",
+                    "FLNT",
+                    "-p",
+                    self.data_path,
+                    "-i",
+                    *chunk,
+                    "-o",
+                    f"FLNT_{i}",
+                ]
+            ):
+                print(f"Return code: {return_code}")
+
+
+def main() -> None:
+    creator = cd.Dataset()
+    file_format = creator.file_format_list.copy()
+    while file_format:
+        file_format_ = file_format.pop()
+        print(file_format_)
+        creator.clean()
+        creator.make_datasets(file_format_)
+        s = RunGenAgg()
+        s.simulate(3)
+        print(f"Success! {file_format_} files work.")
+        time.sleep(1.5)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/run_gen_agg.sh b/tests/run_gen_agg.sh
new file mode 100644
index 0000000..6401dd0
--- /dev/null
+++ b/tests/run_gen_agg.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+DATA_DIR=
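
For a quick end-to-end check of the test helpers added above, the two modules can also be driven from a Python session. This is a minimal sketch, not part of the change itself: it assumes the repository root as the working directory and puts `tests/` on `sys.path` so the modules import; "NETCDF4" is just one entry of `file_format_list`.

    # Sketch: generate synthetic CESM-like data, then aggregate FLNT with gen_agg.
    import sys

    sys.path.insert(0, "tests")  # make create_data and run_gen_agg importable

    import create_data
    import run_gen_agg

    creator = create_data.Dataset()   # prepares (and wipes) tests/data
    creator.make_datasets("NETCDF4")  # write ten monthly history files
    runner = run_gen_agg.RunGenAgg()
    runner.simulate(splits=3)         # aggregate into three FLNT_* output files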