From c20753bca048b2e453b3f28c6c0061eefd5948ed Mon Sep 17 00:00:00 2001
From: Adam Theisen
Date: Mon, 12 Aug 2024 12:46:49 -0500
Subject: [PATCH] ADD: Adding new function to validate that a file or DataSet
 meets ARM … (#849)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* ADD: Adding new function to validate that a file or DataSet meets ARM
  standards

* ENH: Bug fixes for testing

* ENH: Updating checker for 2d variables

* ENH: Update per Zach's request
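A quick usage sketch (not part of the committed code; the filename below is a
placeholder for any local ARM NetCDF file, and assumes ACT is installed):

    import act

    # Validate a file on disk against the ARM standards checks.
    errors = act.utils.arm_standards_validator(file='sgpmetE13.b1.20200501.000000.cdf')

    # Or validate an already opened xarray Dataset without printing the errors.
    ds = act.io.read_arm_netcdf('sgpmetE13.b1.20200501.000000.cdf')
    errors = act.utils.arm_standards_validator(dataset=ds, verbose=False)
    if errors:
        raise RuntimeError('\n'.join(errors))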
---
 act/utils/__init__.py        |   1 +
 act/utils/io_utils.py        | 201 ++++++++++++++++++++++++++++++++---
 tests/utils/test_io_utils.py |  53 +++++++++
 3 files changed, 243 insertions(+), 12 deletions(-)

diff --git a/act/utils/__init__.py b/act/utils/__init__.py
index 2f31e9755a..6e72622d50 100644
--- a/act/utils/__init__.py
+++ b/act/utils/__init__.py
@@ -63,6 +63,7 @@
         'pack_gzip',
         'unpack_gzip',
         'generate_movie',
+        'arm_standards_validator',
     ],
     },
 )

diff --git a/act/utils/io_utils.py b/act/utils/io_utils.py
index 9f67cc8f75..236f1beccb 100644
--- a/act/utils/io_utils.py
+++ b/act/utils/io_utils.py
@@ -1,5 +1,9 @@
+import act
+import numpy as np
+import pandas as pd
 from pathlib import Path
 import tarfile
+from os import sep
 from os import PathLike
 from shutil import rmtree
 import gzip
@@ -21,8 +25,6 @@ def pack_tar(filenames, write_filename=None, write_directory=None, remove=False)
     Creates TAR file from list of filenames provided. Currently only works with
     all files existing in the same directory.
 
-    ...
-
     Parameters
     ----------
     filenames : str or list
@@ -81,8 +83,6 @@ def unpack_tar(
     """
     Unpacks TAR file contents into provided base directory
 
-    ...
-
     Parameters
     ----------
     tar_files : str or list
@@ -155,8 +155,6 @@ def cleanup_files(dirname=None, files=None):
     """
     Cleans up files and directory possibly created from unpacking TAR files with unpack_tar()
 
-    ...
-
     Parameters
     ----------
     dirname : str, pathlib.Path, None
@@ -209,8 +207,6 @@ def pack_gzip(filename, write_directory=None, remove=False):
     """
     Creates a gunzip file from a filename path
 
-    ...
-
     Parameters
     ----------
     filename : str, pathlib.Path
@@ -249,8 +245,6 @@ def unpack_gzip(filename, write_directory=None, remove=False):
     """
     Extracts file from a gunzip file.
 
-    ...
-
     Parameters
     ----------
     filename : str, pathlib.Path
@@ -290,8 +284,6 @@ def generate_movie(images, write_filename=None, fps=10, **kwargs):
     """
     Creates a movie from a list of images or convert movie to different type
 
-    ...
-
     Parameters
     ----------
     images : list, PosixPath generator, path to a directory, single string/PosixPath to movie
@@ -376,3 +368,188 @@ def generate_movie(images, write_filename=None, fps=10, **kwargs):
         clip.write_videofile(str(write_filename), **kwargs)
 
     return str(write_filename)
+
+
+def arm_standards_validator(file=None, dataset=None, verbose=True):
+    """
+    ARM Data Validator (ADV) - Checks that the file or dataset passed in
+    follows ARM standards. Note that this checks against a minimal set of
+    standards.
+
+    Parameters
+    ----------
+    file : str
+        Filename to check against ARM standards. Do not pass in both a file
+        and a dataset.
+    dataset : xarray.Dataset
+        Xarray dataset of an already read-in file.
+    verbose : boolean
+        If True (default), print out errors in addition to returning a list
+        of them.
+
+    Returns
+    -------
+    err : list
+        List of errors found in the data
+
+    """
+
+    # Set up the error tracking list
+    err = []
+    if file is not None and isinstance(file, str):
+        # Check file naming standards
+        if len(file.split(sep)[-1]) > 60:
+            err.append('Filename length exceeds 60 characters')
+
+        f_obj = None
+        try:
+            f_obj = act.utils.data_utils.DatastreamParserARM(file)
+        except Exception as e:
+            print(e)
+
+        if (
+            (f_obj is None)
+            or (f_obj.site is None)
+            or (f_obj.datastream_class is None)
+            or (f_obj.level is None)
+            or (f_obj.facility is None)
+            or (f_obj.date is None)
+            or (f_obj.time is None)
+            or (f_obj.ext is None)
+        ):
+            err.append(
+                'Filename does not follow the normal ARM convention: '
+                + '(sss)(inst)(qualifier)(temporal)(Fn).(dl).(yyyymmdd).(hhmmss).nc'
+            )
+        else:
+            if f_obj.level[0] not in ['0', 'a', 'b', 'c', 's', 'm']:
+                err.append(f_obj.level + ' is not a standard ARM data level')
+
+            results = act.utils.arm_site_location_search(
+                site_code=f_obj.site, facility_code=f_obj.facility
+            )
+            if len(results) == 0:
+                err.append('Site and facility are not ARM standard')
+
+    # The ability to read a file from NetCDF into xarray will catch a lot of
+    # formatting problems.  That leaves the ARM-specific checks below.
+    try:
+        if dataset is None and file is not None:
+            ds = act.io.read_arm_netcdf(file)
+        elif dataset is not None:
+            ds = dataset
+        else:
+            raise ValueError('File and dataset are both None')
+    except Exception as e:
+        return ['File is not in a standard format that is readable by xarray: ' + str(e)]
+
+    # Review time variables for conformance to standards
+    if 'time' not in list(ds.dims)[0]:
+        err.append('"time" is required to be the first dimension.')
+
+    for c in list(ds.coords):
+        if c not in ds.dims:
+            err.append(c + ': Coordinate is not included in dimensions.')
+
+    if any(np.isnan(ds['time'].values)):
+        err.append('Time must not include NaNs.')
+
+    duplicates = sum(ds['time'].to_pandas().duplicated())
+    if duplicates > 0:
+        err.append('Duplicate times present in the file')
+
+    diff = ds['time'].diff('time')
+    idx = np.where(diff <= pd.Timedelta(0))
+    if len(idx[0]) > 0:
+        err.append('Time is not in increasing order')
+
+    if 'base_time' not in ds or 'time_offset' not in ds:
+        err.append('ARM requires base_time and time_offset variables.')
+
+    # Check to make sure other coordinate variables don't have NaNs
+    # Also check to make sure coordinate variables are not decreasing
+    if len(list(ds.coords)) > 1:
+        for d in ds.coords:
+            if d == 'time':
+                continue
+            if any(np.isnan(ds[d].values)):
+                err.append('Coordinates must not include NaNs: ' + d)
+
+            diff = ds[d].diff(d)
+            idx = np.where(diff <= 0.0)
+            if len(idx[0]) > 0:
+                err.append(d + ' is not in increasing order')
+            if 'missing_value' in ds[d].encoding:
+                err.append(d + ' should not include missing_value')
+
+    # Verify that each variable has long_name and units attributes
+    for v in ds:
+        if (len(ds[v].dims) > 0) and ('time' not in list(ds[v].dims)[0]) and ('bounds' not in v):
+            err.append(v + ': "time" is required to be the first dimension.')
+        if (ds[v].size == 1) and (len(ds[v].dims) > 0):
+            err.append(v + ': is not defined as a scalar.')
+        if 'long_name' not in ds[v].attrs:
+            err.append('Required attribute long_name not in ' + v)
+        else:
+            if not ds[v].attrs['long_name'][0].isupper():
+                err.append(v + ' long_name attribute does not start with uppercase')
+
+        if (
+            ('qc_' not in v)
+            and (v not in ['time', 'time_offset', 'base_time', 'lat', 'lon', 'alt'])
+            and ('bounds' not in v)
+        ):
+            if ('missing_value' not in ds[v].encoding) and ('FillValue' not in ds[v].encoding):
+                err.append(v + ' does not include missing_value or FillValue attributes')
+
+        # QC variable checks
+        if 'qc_' in v:
+            if v[3:] not in ds:
+                err.append('QC variable does not have a corresponding variable ' + v[3:])
+            elif 'ancillary_variables' not in ds[v[3:]].attrs:
+                err.append(
+                    v[3:] + ' does not include ancillary_variables attribute pointing to ' + v
+                )
+            if 'description' not in ds[v].attrs:
+                err.append(v + ' does not include description attribute')
+            if 'flag_method' not in ds[v].attrs:
+                err.append(v + ' does not include flag_method attribute')
+
+        if (v not in ['base_time', 'time_offset']) and ('bounds' not in v):
+            if 'units' not in ds[v].attrs:
+                err.append('Required attribute units not in ' + v)
+
+    # Lat/Lon/Alt checks
+    if 'lat' not in ds:
+        err.append('ARM requires the latitude variable to be named lat')
+    else:
+        if 'standard_name' in ds['lat'].attrs:
+            if ds['lat'].attrs['standard_name'] != 'latitude':
+                err.append('ARM requires the lat standard_name to be latitude')
+        else:
+            err.append('"lat" variable does not have a standard_name attribute')
+    if 'lon' not in ds:
+        err.append('ARM requires the longitude variable to be named lon')
+    else:
+        if 'standard_name' in ds['lon'].attrs:
+            if ds['lon'].attrs['standard_name'] != 'longitude':
+                err.append('ARM requires the lon standard_name to be longitude')
+        else:
+            err.append('"lon" variable does not have a standard_name attribute')
+    if 'alt' not in ds:
+        err.append('ARM requires the altitude variable to be named alt')
+    else:
+        if 'standard_name' in ds['alt'].attrs:
+            if ds['alt'].attrs['standard_name'] != 'altitude':
+                err.append('ARM requires the alt standard_name to be altitude')
+        else:
+            err.append('"alt" variable does not have a standard_name attribute')
+
+    # Required global attributes
+    req_att = ['doi', 'sampling_interval', 'averaging_interval']
+    for ra in req_att:
+        if ra not in ds.attrs:
+            err.append('Global attribute is missing: ' + ra)
+
+    if verbose:
+        if len(err) > 0:
+            [print(e) for e in err]
+        else:
+            print('File is passing standards checks')
+
+    return err
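For reference, the filename checks above lean on ACT's datastream parser. A
small sketch of exercising it on its own (the commented field values are the
expected parse for this placeholder name, not verified output):

    from act.utils.data_utils import DatastreamParserARM

    # Parse a conforming (sss)(inst)(qualifier)(Fn).(dl).(yyyymmdd).(hhmmss) name.
    f_obj = DatastreamParserARM('sgpmetE13.b1.20200501.000000.cdf')
    print(f_obj.site, f_obj.datastream_class, f_obj.facility, f_obj.level)
    # Expected: sgp met E13 b1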
diff --git a/tests/utils/test_io_utils.py b/tests/utils/test_io_utils.py
index 4d7263cd74..6bde1635c0 100644
--- a/tests/utils/test_io_utils.py
+++ b/tests/utils/test_io_utils.py
@@ -285,3 +285,56 @@ def test_generate_movie():
 
     finally:
         chdir(cwd)
+
+
+def test_arm_standards_validator():
+    met_files = sample_files.EXAMPLE_MET_SAIL
+    errors = act.utils.arm_standards_validator(met_files)
+    assert len(errors) == 0
+
+    ds = act.io.read_arm_netcdf(met_files)
+    ds2 = ds.drop_vars(['lat', 'lon', 'alt'])
+    errors = act.utils.arm_standards_validator(dataset=ds2)
+    assert len(errors) == 3
+
+    ds2 = ds
+    var = ['lat', 'lon', 'alt']
+    for v in var:
+        del ds2[v].attrs['standard_name']
+
+    errors = act.utils.arm_standards_validator(dataset=ds2)
+    assert len(errors) == 3
+
+    ds2 = ds
+    for v in var:
+        ds2[v].attrs['standard_name'] = 'test'
+    errors = act.utils.arm_standards_validator(dataset=ds2)
+    assert len(errors) == 3
+
+    ds2 = ds
+    for v in ds2:
+        del ds2[v].attrs['long_name']
+    errors = act.utils.arm_standards_validator(dataset=ds2)
+    assert len(errors) == 54
+
+    ds2 = act.io.read_arm_netcdf(met_files)
+    ds2['time'].values[1] = ds2['time'].values[10]
+    errors = act.utils.arm_standards_validator(dataset=ds2)
+    assert 'Duplicate' in errors[0]
+    assert 'increasing' in errors[1]
+
+    file = 'shpinstrumentX50.z1.202005.000000.nc'
+    errors = act.utils.arm_standards_validator(file)
+    assert 'File is not in a standard format' in errors[0]
+
+    file = 'sgpmetE13.z1.20200501.000000.nc'
+    errors = act.utils.arm_standards_validator(file)
+    assert 'no files to open' in errors[0]
+
+    ds = act.io.read_arm_netcdf(sample_files.EXAMPLE_CEIL1)
+    errors = act.utils.arm_standards_validator(dataset=ds)
+    assert len(errors) == 4
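Beyond the bundled sample files, the dataset path can also be smoke-tested
with a purely synthetic Dataset. A minimal sketch (my own construction, not
part of the patch; the variable name and values are illustrative) that should
trip the long_name/units, base_time/time_offset, lat/lon/alt, and global
attribute checks:

    import numpy as np
    import pandas as pd
    import xarray as xr
    import act

    # Build a tiny dataset with a monotonic time coordinate and one variable
    # that is deliberately missing long_name, units, and other required metadata.
    times = pd.date_range('2020-05-01', periods=4, freq='min')
    ds = xr.Dataset(
        data_vars={'temp_mean': ('time', np.array([10.0, 10.5, 11.0, 11.2]))},
        coords={'time': times},
    )

    errors = act.utils.arm_standards_validator(dataset=ds, verbose=False)
    assert any('long_name' in e for e in errors)
    assert any('base_time' in e for e in errors)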