From c20753bca048b2e453b3f28c6c0061eefd5948ed Mon Sep 17 00:00:00 2001
From: Adam Theisen
Date: Mon, 12 Aug 2024 12:46:49 -0500
Subject: [PATCH] ADD: Adding new function to validate that a file or DataSet
 meets ARM … (#849)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* ADD: Adding new function to validate that a file or DataSet meets ARM
  standards

* ENH: Bug fixes for testing

* ENH: Updating checker for 2d variables

* ENH: Update per Zach's request
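A quick usage sketch (not part of the committed code; the filename below is a
placeholder for any local ARM NetCDF file, and assumes ACT is installed):

    import act

    # Validate a file on disk against the ARM standards checks.
    errors = act.utils.arm_standards_validator(file='sgpmetE13.b1.20200501.000000.cdf')

    # Or validate an already opened xarray Dataset without printing the errors.
    ds = act.io.read_arm_netcdf('sgpmetE13.b1.20200501.000000.cdf')
    errors = act.utils.arm_standards_validator(dataset=ds, verbose=False)
    if errors:
        raise RuntimeError('\n'.join(errors))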
---
 act/utils/__init__.py        |   1 +
 act/utils/io_utils.py        | 201 ++++++++++++++++++++++++++++++++---
 tests/utils/test_io_utils.py |  53 +++++++++
 3 files changed, 243 insertions(+), 12 deletions(-)

diff --git a/act/utils/__init__.py b/act/utils/__init__.py
index 2f31e9755a..6e72622d50 100644
--- a/act/utils/__init__.py
+++ b/act/utils/__init__.py
@@ -63,6 +63,7 @@
         'pack_gzip',
         'unpack_gzip',
         'generate_movie',
+        'arm_standards_validator',
     ],
     },
 )

diff --git a/act/utils/io_utils.py b/act/utils/io_utils.py
index 9f67cc8f75..236f1beccb 100644
--- a/act/utils/io_utils.py
+++ b/act/utils/io_utils.py
@@ -1,5 +1,9 @@
+import act
+import numpy as np
+import pandas as pd
 from pathlib import Path
 import tarfile
+from os import sep
 from os import PathLike
 from shutil import rmtree
 import gzip
@@ -21,8 +25,6 @@ def pack_tar(filenames, write_filename=None, write_directory=None, remove=False)
     Creates TAR file from list of filenames provided. Currently only works with
     all files existing in the same directory.
 
-    ...
-
     Parameters
     ----------
     filenames : str or list
@@ -81,8 +83,6 @@ def unpack_tar(
     """
     Unpacks TAR file contents into provided base directory
 
-    ...
-
     Parameters
     ----------
     tar_files : str or list
@@ -155,8 +155,6 @@ def cleanup_files(dirname=None, files=None):
     """
     Cleans up files and directory possibly created from unpacking TAR files with unpack_tar()
 
-    ...
-
     Parameters
     ----------
     dirname : str, pathlib.Path, None
@@ -209,8 +207,6 @@ def pack_gzip(filename, write_directory=None, remove=False):
     """
     Creates a gunzip file from a filename path
 
-    ...
-
     Parameters
     ----------
     filename : str, pathlib.Path
@@ -249,8 +245,6 @@ def unpack_gzip(filename, write_directory=None, remove=False):
     """
     Extracts file from a gunzip file.
 
-    ...
-
     Parameters
     ----------
     filename : str, pathlib.Path
@@ -290,8 +284,6 @@ def generate_movie(images, write_filename=None, fps=10, **kwargs):
     """
     Creates a movie from a list of images or convert movie to different type
 
-    ...
-
     Parameters
     ----------
     images : list, PosixPath generator, path to a directory, single string/PosixPath to movie
@@ -376,3 +368,188 @@ def generate_movie(images, write_filename=None, fps=10, **kwargs):
         clip.write_videofile(str(write_filename), **kwargs)
 
     return str(write_filename)
+
+
+def arm_standards_validator(file=None, dataset=None, verbose=True):
+    """
+    ARM Data Validator (ADV) - Checks that the file or dataset passed in
+    follows ARM standards. Note that this checks against a minimal set of
+    standards.
+
+    Parameters
+    ----------
+    file : str
+        Filename to check against ARM standards. Do not pass in both a file
+        and a dataset.
+    dataset : xarray.Dataset
+        Xarray dataset of an already read-in file.
+    verbose : boolean
+        If True (default), print out errors in addition to returning a list
+        of them.
+
+    Returns
+    -------
+    err : list
+        List of errors found in the data
+
+    """
+
+    # Set up the error tracking list
+    err = []
+    if file is not None and isinstance(file, str):
+        # Check file naming standards
+        if len(file.split(sep)[-1]) > 60:
+            err.append('Filename length exceeds 60 characters')
+
+        f_obj = None
+        try:
+            f_obj = act.utils.data_utils.DatastreamParserARM(file)
+        except Exception as e:
+            print(e)
+
+        if (
+            (f_obj is None)
+            or (f_obj.site is None)
+            or (f_obj.datastream_class is None)
+            or (f_obj.level is None)
+            or (f_obj.facility is None)
+            or (f_obj.date is None)
+            or (f_obj.time is None)
+            or (f_obj.ext is None)
+        ):
+            err.append(
+                'Filename does not follow the normal ARM convention: '
+                + '(sss)(inst)(qualifier)(temporal)(Fn).(dl).(yyyymmdd).(hhmmss).nc'
+            )
+        else:
+            if f_obj.level[0] not in ['0', 'a', 'b', 'c', 's', 'm']:
+                err.append(f_obj.level + ' is not a standard ARM data level')
+
+            results = act.utils.arm_site_location_search(
+                site_code=f_obj.site, facility_code=f_obj.facility
+            )
+            if len(results) == 0:
+                err.append('Site and facility are not ARM standard')
+
+    # The ability to read a file from NetCDF into xarray will catch a lot of
+    # formatting problems.  That leaves the ARM-specific checks below.
+    try:
+        if dataset is None and file is not None:
+            ds = act.io.read_arm_netcdf(file)
+        elif dataset is not None:
+            ds = dataset
+        else:
+            raise ValueError('File and dataset are both None')
+    except Exception as e:
+        return ['File is not in a standard format that is readable by xarray: ' + str(e)]
+
+    # Review time variables for conformance to standards
+    if 'time' not in list(ds.dims)[0]:
+        err.append('"time" is required to be the first dimension.')
+
+    for c in list(ds.coords):
+        if c not in ds.dims:
+            err.append(c + ': Coordinate is not included in dimensions.')
+
+    if any(np.isnan(ds['time'].values)):
+        err.append('Time must not include NaNs.')
+
+    duplicates = sum(ds['time'].to_pandas().duplicated())
+    if duplicates > 0:
+        err.append('Duplicate times present in the file')
+
+    diff = ds['time'].diff('time')
+    idx = np.where(diff <= pd.Timedelta(0))
+    if len(idx[0]) > 0:
+        err.append('Time is not in increasing order')
+
+    if 'base_time' not in ds or 'time_offset' not in ds:
+        err.append('ARM requires base_time and time_offset variables.')
+
+    # Check to make sure other coordinate variables don't have NaNs
+    # Also check to make sure coordinate variables are not decreasing
+    if len(list(ds.coords)) > 1:
+        for d in ds.coords:
+            if d == 'time':
+                continue
+            if any(np.isnan(ds[d].values)):
+                err.append('Coordinates must not include NaNs: ' + d)
+
+            diff = ds[d].diff(d)
+            idx = np.where(diff <= 0.0)
+            if len(idx[0]) > 0:
+                err.append(d + ' is not in increasing order')
+            if 'missing_value' in ds[d].encoding:
+                err.append(d + ' should not include missing_value')
+
+    # Verify that each variable has long_name and units attributes
+    for v in ds:
+        if (len(ds[v].dims) > 0) and ('time' not in list(ds[v].dims)[0]) and ('bounds' not in v):
+            err.append(v + ': "time" is required to be the first dimension.')
+        if (ds[v].size == 1) and (len(ds[v].dims) > 0):
+            err.append(v + ': is not defined as a scalar.')
+        if 'long_name' not in ds[v].attrs:
+            err.append('Required attribute long_name not in ' + v)
+        else:
+            if not ds[v].attrs['long_name'][0].isupper():
+                err.append(v + ' long_name attribute does not start with uppercase')
+
+        if (
+            ('qc_' not in v)
+            and (v not in ['time', 'time_offset', 'base_time', 'lat', 'lon', 'alt'])
+            and ('bounds' not in v)
+        ):
+            if ('missing_value' not in ds[v].encoding) and ('FillValue' not in ds[v].encoding):
+                err.append(v + ' does not include missing_value or FillValue attributes')
+
+        # QC variable checks
+        if 'qc_' in v:
+            if v[3:] not in ds:
+                err.append('QC variable does not have a corresponding variable ' + v[3:])
+            elif 'ancillary_variables' not in ds[v[3:]].attrs:
+                err.append(
+                    v[3:] + ' does not include ancillary_variables attribute pointing to ' + v
+                )
+            if 'description' not in ds[v].attrs:
+                err.append(v + ' does not include description attribute')
+            if 'flag_method' not in ds[v].attrs:
+                err.append(v + ' does not include flag_method attribute')
+
+        if (v not in ['base_time', 'time_offset']) and ('bounds' not in v):
+            if 'units' not in ds[v].attrs:
+                err.append('Required attribute units not in ' + v)
+
+    # Lat/Lon/Alt checks
+    if 'lat' not in ds:
+        err.append('ARM requires the latitude variable to be named lat')
+    else:
+        if 'standard_name' in ds['lat'].attrs:
+            if ds['lat'].attrs['standard_name'] != 'latitude':
+                err.append('ARM requires the lat standard_name to be latitude')
+        else:
+            err.append('"lat" variable does not have a standard_name attribute')
+    if 'lon' not in ds:
+        err.append('ARM requires the longitude variable to be named lon')
+    else:
+        if 'standard_name' in ds['lon'].attrs:
+            if ds['lon'].attrs['standard_name'] != 'longitude':
+                err.append('ARM requires the lon standard_name to be longitude')
+        else:
+            err.append('"lon" variable does not have a standard_name attribute')
+    if 'alt' not in ds:
+        err.append('ARM requires the altitude variable to be named alt')
+    else:
+        if 'standard_name' in ds['alt'].attrs:
+            if ds['alt'].attrs['standard_name'] != 'altitude':
+                err.append('ARM requires the alt standard_name to be altitude')
+        else:
+            err.append('"alt" variable does not have a standard_name attribute')
+
+    # Required global attributes
+    req_att = ['doi', 'sampling_interval', 'averaging_interval']
+    for ra in req_att:
+        if ra not in ds.attrs:
+            err.append('Global attribute is missing: ' + ra)
+
+    if verbose:
+        if len(err) > 0:
+            [print(e) for e in err]
+        else:
+            print('File is passing standards checks')
+
+    return err
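For reference, the filename checks above lean on ACT's datastream parser. A
small sketch of exercising it on its own (the commented field values are the
expected parse for this placeholder name, not verified output):

    from act.utils.data_utils import DatastreamParserARM

    # Parse a conforming (sss)(inst)(qualifier)(Fn).(dl).(yyyymmdd).(hhmmss) name.
    f_obj = DatastreamParserARM('sgpmetE13.b1.20200501.000000.cdf')
    print(f_obj.site, f_obj.datastream_class, f_obj.facility, f_obj.level)
    # Expected: sgp met E13 b1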
diff --git a/tests/utils/test_io_utils.py b/tests/utils/test_io_utils.py
index 4d7263cd74..6bde1635c0 100644
--- a/tests/utils/test_io_utils.py
+++ b/tests/utils/test_io_utils.py
@@ -285,3 +285,56 @@ def test_generate_movie():
 
     finally:
         chdir(cwd)
+
+
+def test_arm_standards_validator():
+    met_files = sample_files.EXAMPLE_MET_SAIL
+    errors = act.utils.arm_standards_validator(met_files)
+    assert len(errors) == 0
+
+    ds = act.io.read_arm_netcdf(met_files)
+    ds2 = ds.drop_vars(['lat', 'lon', 'alt'])
+    errors = act.utils.arm_standards_validator(dataset=ds2)
+    assert len(errors) == 3
+
+    ds2 = ds
+    var = ['lat', 'lon', 'alt']
+    for v in var:
+        del ds2[v].attrs['standard_name']
+
+    errors = act.utils.arm_standards_validator(dataset=ds2)
+    assert len(errors) == 3
+
+    ds2 = ds
+    for v in var:
+        ds2[v].attrs['standard_name'] = 'test'
+    errors = act.utils.arm_standards_validator(dataset=ds2)
+    assert len(errors) == 3
+
+    ds2 = ds
+    for v in ds2:
+        del ds2[v].attrs['long_name']
+    errors = act.utils.arm_standards_validator(dataset=ds2)
+    assert len(errors) == 54
+
+    ds2 = act.io.read_arm_netcdf(met_files)
+    ds2['time'].values[1] = ds2['time'].values[10]
+    errors = act.utils.arm_standards_validator(dataset=ds2)
+    assert 'Duplicate' in errors[0]
+    assert 'increasing' in errors[1]
+
+    file = 'shpinstrumentX50.z1.202005.000000.nc'
+    errors = act.utils.arm_standards_validator(file)
+    assert 'File is not in a standard format' in errors[0]
+
+    file = 'sgpmetE13.z1.20200501.000000.nc'
+    errors = act.utils.arm_standards_validator(file)
+    assert 'no files to open' in errors[0]
+
+    ds = act.io.read_arm_netcdf(sample_files.EXAMPLE_CEIL1)
+    errors = act.utils.arm_standards_validator(dataset=ds)
+    assert len(errors) == 4
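Beyond the bundled sample files, the dataset path can also be smoke-tested
with a purely synthetic Dataset. A minimal sketch (my own construction, not
part of the patch; the variable name and values are illustrative) that should
trip the long_name/units, base_time/time_offset, lat/lon/alt, and global
attribute checks:

    import numpy as np
    import pandas as pd
    import xarray as xr
    import act

    # Build a tiny dataset with a monotonic time coordinate and one variable
    # that is deliberately missing long_name, units, and other required metadata.
    times = pd.date_range('2020-05-01', periods=4, freq='min')
    ds = xr.Dataset(
        data_vars={'temp_mean': ('time', np.array([10.0, 10.5, 11.0, 11.2]))},
        coords={'time': times},
    )

    errors = act.utils.arm_standards_validator(dataset=ds, verbose=False)
    assert any('long_name' in e for e in errors)
    assert any('base_time' in e for e in errors)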