Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding step detection test #889

Merged
merged 3 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions act/qc/qctests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1629,3 +1629,142 @@ def add_atmospheric_pressure_test(
)

return result

def add_step_change_test(
self,
var_name,
k=1.0,
detrend=True,
n_flagged=2,
add_nan=False,
test_meaning=None,
test_assessment='Indeterminate',
test_number=None,
flag_value=False,
prepend_text=None,
):
"""
Method to detect a shift change in values using the CUSUM (cumulative sum control chart) test.

Parameters
----------
var_name : str
Data variable name in Dataset to use for testing. Results are inserted into
accompanying embedded quality control variable.
k : float
Reference value. This is typically around the value of the shift size change to
to be detected when detrend=True. Will typically be around half the value
of the shift size when no detrend is applied.
detrend : bool
Remove the trend in the data by differencing the data before
applying the CUSUM algorithm. Needed for most data that have atmospheric variability.
n_flagged : int
Number of time steps to flag in the quality control variable. Default is to
flag the point of and one after the step change since we will not know which
value of the two will be suspect. Can set to any number of time steps
or -1 to flag the remaining data after the detected step change.
add_nan : bool
Should a NaN value be added to the data where a value is missing. A value is
determined to be missing when the time step is larger than the mode of the
difference in time values. This will stop the reporting of a step when there
is an outage and the data normally rises/decends but with the gap is appears
as a step change.
test_meaning : str
Optional text description to add to flag_meanings
describing the test. Will use a default if not set.
test_assessment : str
Optional single word describing the assessment of the test.
Will use a default if not set.
test_number : int
Optional test number to use. If not set will use next available
test number.
flag_value : boolean
Indicates that the tests are stored as integers
not bit packed values in quality control variable.
prepend_text : str
Optional text to prepend to the test meaning.
Example is indicate what institution added the test.

Returns
-------
test_info : tuple
A tuple containing test information including var_name, qc
variable name, test_number, test_meaning, test_assessment

"""

def cusum(data, k, mean_val, lower=False):
"""
CUSUM algrithm used to detect step changes.
kenkehoe marked this conversation as resolved.
Show resolved Hide resolved

data : numpy array
1D numpy array of time series data to analze
k : float
Reference value. This is typically half the value of the shift size change to
to be detected.
mean_val : float
Mean value of data.
lower : bool
Option to loof for lower shifts vs. upper shifts

"""

C = np.zeros(data.size)
if lower:
for ii in range(1, data.size):
kenkehoe marked this conversation as resolved.
Show resolved Hide resolved
C[ii] = max(0, C[ii - 1] - (data[ii] - mean_val + k))
else:
for ii in range(1, data.size):
C[ii] = max(0, C[ii - 1] + (data[ii] - mean_val - k))

return C

data = self._ds[var_name].values
time = self._ds['time'].values
if add_nan:
from act.utils.data_utils import add_in_nan

time, data = add_in_nan(time, data)

data = data.astype(float)
if detrend:
data = np.diff(data)
data = np.append(np.nan, data)

if n_flagged < 0:
n_flagged = data.size

mean_val = np.nanmean(data)
index = np.full(data.size, False)
for lower in [False, True]:
C = cusum(data, k, mean_val, lower=lower)
found_ind = np.where(np.diff(C) > 0.0)[0]
for ind in found_ind:
ind = np.arange(ind, ind + n_flagged)
ind = ind[ind < data.size]
index[ind] = True

if add_nan:
import xarray as xr

da = xr.DataArray(index, dims=['time'], coords=[time])
result = da.sel(time=self._ds['time'])
index = result.values
del result

if test_meaning is None:
test_meaning = f'Shift in data detected with CUSUM algrithm: k={round(k, 2)}'

if prepend_text is not None:
test_meaning = ': '.join((prepend_text, test_meaning))

result = self._ds.qcfilter.add_test(
var_name,
index=index,
test_number=test_number,
test_meaning=test_meaning,
test_assessment=test_assessment,
flag_value=flag_value,
)

return result
60 changes: 60 additions & 0 deletions examples/qc/plot_qc_step_change.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""

This is an example for how to use the step change detection test.
The test uses the cumulative sum control chart to detect when
a sudden shift in values occurs. It has an option to insert
NaN value when there is a data gap to not have those periods
returned as a data shift. This example produces two plots,
one with the data gap flagged and one without.

"""

from matplotlib import pyplot as plt
import numpy as np

from arm_test_data import DATASETS
from act.io.arm import read_arm_netcdf


# Get example data from ARM Test Data repository
EXAMPLE_MET = DATASETS.fetch('sgpmetE13.b1.20190101.000000.cdf')
variable = 'temp_mean'
ds = read_arm_netcdf(EXAMPLE_MET, keep_variables=variable)

# Add shifts in the data
data = ds[variable].values
data[600:] += 2
data[1000:] -= 2
ds[variable].values = data

# Remove data from the Dataset to simulate instrument being off-line
ds = ds.where((ds["time.hour"] < 3) | (ds["time.hour"] > 5), drop=True)

# Add step change test
ds.qcfilter.add_step_change_test(variable)

# Add step change test but insert NaN values during period of missing data
# so it does not trip the test.
ds.qcfilter.add_step_change_test(variable, add_nan=True)

# Make plot with results from the step change test for when the missing data
# is included and a second plot without including the missing data gap.
title = 'Step change detection'
for ii in range(1, 3):
plt.figure(figsize=(10, 6))
plt.plot(ds['time'].values, ds[variable].values, label='Data')
plt.xlabel('Time')
plt.ylabel(f"{ds[variable].attrs['long_name']} ({ds[variable].attrs['units']})")
plt.title(title)
plt.grid(lw=2, ls=':')

label = 'Step change'
index = ds.qcfilter.get_qc_test_mask(var_name=variable, test_number=ii)
for jj in np.where(index)[0]:
plt.axvline(x=ds['time'].values[jj], color='orange', linestyle='--', label=label)
label = None

title += ' with NaN added in data gaps'

plt.legend()
plt.show()
88 changes: 88 additions & 0 deletions tests/qc/test_qctests.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,3 +417,91 @@ def test_add_atmospheric_pressure_test():

ds.close
del ds


def test_add_step_change_test():
variable = 'temp_mean'
qc_variable = f"qc_{variable}"
ds = read_arm_netcdf(EXAMPLE_MET1, keep_variables=['temp_mean', 'atmos_pressure'])
ds.load()

result = ds.qcfilter.add_step_change_test(variable)
assert result == {
'test_number': 1,
'test_meaning': 'Shift in data detected with CUSUM algrithm: k=1.0',
kenkehoe marked this conversation as resolved.
Show resolved Hide resolved
'test_assessment': 'Indeterminate',
'qc_variable_name': qc_variable,
'variable_name': variable,
}
index = ds.qcfilter.get_qc_test_mask(var_name=variable, test_number=1)
assert len(np.where(index)[0]) == 0
assert ds[qc_variable].attrs['flag_meanings'] == [
'Shift in data detected with CUSUM algrithm: k=1.0'
kenkehoe marked this conversation as resolved.
Show resolved Hide resolved
]
assert ds[qc_variable].attrs['flag_assessments'] == ['Indeterminate']

data = ds[variable].values
data[100:] -= 5
data[600:] += 4
data[800:] += 10
data[1000:] -= 2
ds[variable].values = data

ds.qcfilter.add_step_change_test(variable)
index = ds.qcfilter.get_qc_test_mask(var_name=variable, test_number=2)
assert np.all(np.where(index)[0] == [99, 100, 599, 600, 799, 800, 999, 1000])
assert (
ds[qc_variable].attrs['flag_meanings'][1]
== 'Shift in data detected with CUSUM algrithm: k=1.0'
kenkehoe marked this conversation as resolved.
Show resolved Hide resolved
)
assert ds[qc_variable].attrs['flag_assessments'][1] == 'Indeterminate'

ds.qcfilter.add_step_change_test(variable, k=4, prepend_text='ARM')
index = ds.qcfilter.get_qc_test_mask(var_name=variable, test_number=3)
assert np.all(np.where(index)[0] == [99, 100, 599, 600, 799, 800])
assert (
ds[qc_variable].attrs['flag_meanings'][2]
== 'ARM: Shift in data detected with CUSUM algrithm: k=4'
kenkehoe marked this conversation as resolved.
Show resolved Hide resolved
)

ds.qcfilter.add_step_change_test(variable, n_flagged=3)
index = ds.qcfilter.get_qc_test_mask(var_name=variable, test_number=4)
assert np.all(
np.where(index)[0] == [99, 100, 101, 599, 600, 601, 799, 800, 801, 999, 1000, 1001]
)

ds.qcfilter.add_step_change_test(variable, n_flagged=-1, k=5.1, test_assessment='Suspect')
index = ds.qcfilter.get_qc_test_mask(var_name=variable, test_number=5)
assert np.all(np.where(index)[0] == np.arange(799, 1440))
assert (
ds[qc_variable].attrs['flag_meanings'][4]
== 'Shift in data detected with CUSUM algrithm: k=5.1'
kenkehoe marked this conversation as resolved.
Show resolved Hide resolved
)
assert ds[qc_variable].attrs['flag_assessments'][4] == 'Suspect'

variable = 'atmos_pressure'
ds.qcfilter.add_step_change_test(variable, detrend=False)
index = ds.qcfilter.get_qc_test_mask(var_name=variable, test_number=1)
assert len(np.where(index)[0]) == 0

ds.close
del ds

# Test add_nan keyword
variable = 'temp_mean'
ds = read_arm_netcdf(EXAMPLE_MET1, keep_variables=variable)
data = ds[variable].values
data[600:] += 2
ds[variable].values = data

ds = ds.where((ds["time.hour"] < 3) | (ds["time.hour"] > 5), drop=True)

ds.qcfilter.add_step_change_test(variable, add_nan=False)
index = ds.qcfilter.get_qc_test_mask(var_name=variable, test_number=1)
assert np.all(np.where(index)[0] == [179, 180, 419, 420])

ds.qcfilter.add_step_change_test(variable, add_nan=True)
index = ds.qcfilter.get_qc_test_mask(var_name=variable, test_number=2)
assert np.all(np.where(index)[0] == [419, 420])

del ds
Loading