This repository has been archived by the owner on Aug 29, 2023. It is now read-only.

Merge pull request #430 from CCI-Tools/xx-jg-more-monitors
Add more monitors in ops
JanisGailis authored Sep 27, 2017
2 parents 1830f5d + 3b01860 commit c1155d6
Showing 6 changed files with 156 additions and 98 deletions.
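
Note on the pattern this commit threads through the ops: each op gains an optional monitor argument defaulting to Monitor.NONE, single-step work is wrapped in monitor.observing(...), and multi-step work uses monitor.starting(..., total_work=N) with one child monitor per step. The sketch below is a minimal illustration distilled only from the calls visible in the diffs; the op names some_op and some_multi_step_op are invented for the example and are not part of cate.

from xarray import Dataset

from cate.util.monitor import Monitor


def some_op(ds: Dataset, monitor: Monitor = Monitor.NONE) -> Dataset:
    # Single-step work: wrap the whole computation in one observing() context,
    # so the monitor reports it as one indivisible task.
    with monitor.observing("Doing the work"):
        result = ds.mean(keep_attrs=True, skipna=True)
    return result


def some_multi_step_op(ds: Dataset, monitor: Monitor = Monitor.NONE) -> Dataset:
    steps = ['step 1', 'step 2', 'step 3']
    # Multi-step work: declare the total amount of work up front, then hand
    # each step a child monitor that contributes one unit when its context exits.
    with monitor.starting('Calculate result', total_work=len(steps)):
        for step in steps:
            with monitor.child(1).observing(step):
                ds = ds + 1.0
    return ds

With the default Monitor.NONE these calls are no-ops, so existing callers that do not pass a monitor are unaffected.
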
13 changes: 9 additions & 4 deletions cate/ops/anomaly.py
@@ -63,8 +63,9 @@ def anomaly_external(ds: xr.Dataset,
     :param ds: The dataset to calculate anomalies from
     :param file: Path to reference data file
-    :param str: Apply the given transformation before calculating the anomaly.
+    :param transform: Apply the given transformation before calculating the anomaly.
         For supported operations see help on 'ds_arithmetics' operation.
+    :param monitor: a progress monitor.
     :return: The anomaly dataset
     """
     # Check if the time coordinate is of dtype datetime
@@ -126,7 +127,8 @@ def _group_anomaly(group: xr.Dataset,
 @op_return(add_history=True)
 def anomaly_internal(ds: xr.Dataset,
                      time_range: TimeRangeLike.TYPE = None,
-                     region: PolygonLike.TYPE = None) -> xr.Dataset:
+                     region: PolygonLike.TYPE = None,
+                     monitor: Monitor = Monitor.NONE) -> xr.Dataset:
     """
     Calculate anomaly using as reference data the mean of an optional region
     and time slice from the given dataset. If no time slice/spatial region is
@@ -137,6 +139,7 @@ def anomaly_internal(ds: xr.Dataset,
     :param ds: The dataset to calculate anomalies from
     :param time_range: Time range to use for reference data
     :param region: Spatial region to use for reference data
+    :param monitor: a progress monitor.
     :return: The anomaly dataset
     """
     ref = ds.copy()
@@ -146,5 +149,7 @@ def anomaly_internal(ds: xr.Dataset,
     if region:
         region = PolygonLike.convert(region)
         ref = subset_spatial(ref, region)
-    ref = ref.mean(keep_attrs=True, skipna=True)
-    return ds - ref
+    with monitor.observing("Calculating anomaly"):
+        ref = ref.mean(keep_attrs=True, skipna=True)
+        diff = ds - ref
+    return diff
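
As a usage illustration (not part of the commit): with neither time_range nor region given, the reference in anomaly_internal is simply the mean over the whole dataset, so the call below returns each variable's departure from its long-term mean. The dataset, the variable name 'tas', and the values are invented; a real monitor would normally be supplied by Cate's CLI or GUI rather than the Monitor.NONE default.

import numpy as np
import pandas as pd
import xarray as xr

from cate.ops.anomaly import anomaly_internal

# A small dummy lon/lat/time dataset of near-surface air temperature.
time = pd.date_range('2000-01-01', periods=24, freq='MS')
ds = xr.Dataset({'tas': (['time', 'lat', 'lon'], 280.0 + np.random.rand(24, 3, 4))},
                coords={'time': time,
                        'lat': [-10.0, 0.0, 10.0],
                        'lon': [0.0, 10.0, 20.0, 30.0]})

# Reference = mean of the whole dataset; anomaly = ds - reference.
# Passing time_range and/or region would restrict the reference instead.
anomalies = anomaly_internal(ds)
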
63 changes: 37 additions & 26 deletions cate/ops/arithmetics.py
@@ -34,13 +34,15 @@

 from cate.core.op import op, op_input, op_return
 from cate.core.types import DatasetLike
+from cate.util.monitor import Monitor


 @op(tags=['arithmetic'], version='1.0')
 @op_input('ds', data_type=DatasetLike)
 @op_return(add_history=True)
 def ds_arithmetics(ds: DatasetLike.TYPE,
-                   op: str) -> xr.Dataset:
+                   op: str,
+                   monitor: Monitor = Monitor.NONE) -> xr.Dataset:
     """
     Do arithmetic operations on the given dataset by providing a list of
     arithmetic operations and the corresponding constant. The operations will
@@ -62,40 +64,45 @@ def ds_arithmetics(ds: DatasetLike.TYPE,
     :param ds: The dataset to which to apply arithmetic operations
     :param op: A comma separated list of arithmetic operations to apply
+    :param monitor: a progress monitor.
     :return: The dataset with given arithmetic operations applied
     """
     ds = DatasetLike.convert(ds)
     retset = ds
-    for item in op.split(','):
-        item = item.strip()
-        if item[0] == '+':
-            retset = retset + float(item[1:])
-        elif item[0] == '-':
-            retset = retset - float(item[1:])
-        elif item[0] == '*':
-            retset = retset * float(item[1:])
-        elif item[0] == '/':
-            retset = retset / float(item[1:])
-        elif item[:] == 'log':
-            retset = xu.log(retset)
-        elif item[:] == 'log10':
-            retset = xu.log10(retset)
-        elif item[:] == 'log2':
-            retset = xu.log2(retset)
-        elif item[:] == 'log1p':
-            retset = xu.log1p(retset)
-        elif item[:] == 'exp':
-            retset = xu.exp(retset)
-        else:
-            raise ValueError('Arithmetic operation {} not'
-                             ' implemented.'.format(item[0]))
+    with monitor.starting('Calculate result', total_work=len(op.split(','))):
+        for item in op.split(','):
+            with monitor.child(1).observing("Calculate"):
+                item = item.strip()
+                if item[0] == '+':
+                    retset = retset + float(item[1:])
+                elif item[0] == '-':
+                    retset = retset - float(item[1:])
+                elif item[0] == '*':
+                    retset = retset * float(item[1:])
+                elif item[0] == '/':
+                    retset = retset / float(item[1:])
+                elif item[:] == 'log':
+                    retset = xu.log(retset)
+                elif item[:] == 'log10':
+                    retset = xu.log10(retset)
+                elif item[:] == 'log2':
+                    retset = xu.log2(retset)
+                elif item[:] == 'log1p':
+                    retset = xu.log1p(retset)
+                elif item[:] == 'exp':
+                    retset = xu.exp(retset)
+                else:
+                    raise ValueError('Arithmetic operation {} not'
+                                     ' implemented.'.format(item[0]))

     return retset


 @op(tags=['arithmetic'], version='1.0')
 @op_return(add_history=True)
-def diff(ds: xr.Dataset, ds2: xr.Dataset) -> xr.Dataset:
+def diff(ds: xr.Dataset,
+         ds2: xr.Dataset,
+         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
     """
     Calculate the difference of two datasets (ds - ds2). This is done by
     matching variable names in the two datasets against each other and taking
@@ -111,6 +118,7 @@ def diff(ds: xr.Dataset, ds2: xr.Dataset) -> xr.Dataset:
     :param ds: The minuend dataset
     :param ds2: The subtrahend dataset
+    :param monitor: a progress monitor.
     :return: The difference dataset
     """
     try:
@@ -143,4 +151,7 @@ def diff(ds: xr.Dataset, ds2: xr.Dataset) -> xr.Dataset:
         else:
             raise TypeError(str(e))

-    return ds - ds2
+    with monitor.observing("Subtract datasets"):
+        diff = ds - ds2
+
+    return diff
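
A short usage sketch for the op-string format documented above (not part of the commit): items are applied left to right, and after this change each comma-separated item is reported to the monitor as one unit of work. The dataset and its values are invented.

import numpy as np
import xarray as xr

from cate.ops.arithmetics import ds_arithmetics

# A dummy dataset with a single variable.
ds = xr.Dataset({'precip': (['time'], np.array([1.2, 0.4, 3.7]))},
                coords={'time': np.arange(3)})

# Apply two operations in order: add 5, then take the base-10 logarithm.
# With the default Monitor.NONE the per-item progress reports are discarded.
result = ds_arithmetics(ds, '+5, log10')
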
104 changes: 61 additions & 43 deletions cate/ops/correlation.py
@@ -38,6 +38,7 @@

 from cate.core.op import op, op_input, op_return
 from cate.core.types import VarName, DatasetLike
+from cate.util import Monitor

 from cate.ops.normalize import adjust_spatial_attrs

@@ -50,7 +51,8 @@
 def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
                                ds_y: DatasetLike.TYPE,
                                var_x: VarName.TYPE,
-                               var_y: VarName.TYPE) -> pd.DataFrame:
+                               var_y: VarName.TYPE,
+                               monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
     """
     Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.
@@ -68,6 +70,7 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
     :param ds_y: The 'y' dataset
     :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
     :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
+    :param monitor: a progress monitor.
     :returns: {'corr_coef': correlation coefficient, 'p_value': probability value}
     """
     ds_x = DatasetLike.convert(ds_x)
@@ -94,7 +97,9 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
         raise ValueError('The length of the time dimension should not be less'
                          ' than three to run the calculation.')

-    cc, pv = pearsonr(array_x.values, array_y.values)
+    with monitor.observing("Calculate Pearson correlation"):
+        cc, pv = pearsonr(array_x.values, array_y.values)
+
     return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})


@@ -107,7 +112,8 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
 def pearson_correlation(ds_x: DatasetLike.TYPE,
                         ds_y: DatasetLike.TYPE,
                         var_x: VarName.TYPE,
-                        var_y: VarName.TYPE) -> xr.Dataset:
+                        var_y: VarName.TYPE,
+                        monitor: Monitor = Monitor.NONE) -> xr.Dataset:
     """
     Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.
@@ -137,6 +143,7 @@ def pearson_correlation(ds_x: DatasetLike.TYPE,
     :param ds_y: The 'y' dataset
     :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
     :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
+    :param monitor: a progress monitor.
     :returns: a dataset containing a map of correlation coefficients and p_values
     """
     ds_x = DatasetLike.convert(ds_x)
@@ -189,13 +196,13 @@ def pearson_correlation(ds_x: DatasetLike.TYPE,
                          ' than three to run the calculation.')

     # Do pixel by pixel correlation
-    retset = _pearsonr(array_x, array_y)
+    retset = _pearsonr(array_x, array_y, monitor)
     retset.attrs['Cate_Description'] = 'Correlation between {} {}'.format(var_y, var_x)

     return adjust_spatial_attrs(retset)


-def _pearsonr(x: xr.DataArray, y: xr.DataArray) -> xr.Dataset:
+def _pearsonr(x: xr.DataArray, y: xr.DataArray, monitor: Monitor) -> xr.Dataset:
     """
     Calculates Pearson correlation coefficients and p-values for testing
     non-correlation of lon/lat/time xarray datasets for each lon/lat point.
@@ -218,49 +225,60 @@ def _pearsonr(x: xr.DataArray, y: xr.DataArray) -> xr.Dataset:
     :param x: lon/lat/time xr.DataArray
     :param y: xr.DataArray of the same spatiotemporal extents and resolution as
         x.
+    :param monitor: Monitor to use for monitoring the calculation
     :return: A dataset containing the correlation coefficients and p_values on
         the lon/lat grid of x and y.
     References
     ----------
     http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
     """
-    n = len(x['time'])
-
-    xm, ym = x - x.mean(dim='time'), y - y.mean(dim='time')
-    xm_ym = xm * ym
-    r_num = xm_ym.sum(dim='time')
-    xm_squared = xr.ufuncs.square(xm)
-    ym_squared = xr.ufuncs.square(ym)
-    r_den = xr.ufuncs.sqrt(xm_squared.sum(dim='time') *
-                           ym_squared.sum(dim='time'))
-    r_den = r_den.where(r_den != 0)
-    r = r_num / r_den
-
-    # Presumably, if abs(r) > 1, then it is only some small artifact of floating
-    # point arithmetic.
-    # At this point r should be a lon/lat dataArray, so it should be safe to
-    # load it in memory explicitly. This may take time as it will kick-start
-    # deferred processing.
-    # Comparing with NaN produces warnings that can be safely ignored
-    default_warning_settings = np.seterr(invalid='ignore')
-    r.values[r.values < -1.0] = -1.0
-    r.values[r.values > 1.0] = 1.0
-    np.seterr(**default_warning_settings)
-    r.attrs = {'description': 'Correlation coefficients between'
-                              ' {} and {}.'.format(x.name, y.name)}
-
-    df = n - 2
-    t_squared = xr.ufuncs.square(r) * (df / ((1.0 - r.where(r != 1)) *
-                                             (1.0 + r.where(r != -1))))
-    prob = df / (df + t_squared)
-    prob.values = betainc(0.5 * df, 0.5, prob.values)
-    prob.attrs = {'description': 'Rough indicator of probability of an'
-                                 ' uncorrelated system producing datasets that have a Pearson'
-                                 ' correlation at least as extreme as the one computed from'
-                                 ' these datsets. Not entirely reliable, but reasonable for'
-                                 ' datasets larger than 500 or so.'}
-
-    retset = xr.Dataset({'corr_coef': r,
-                         'p_value': prob})
+    with monitor.starting("Calculate Pearson correlation", total_work=6):
+        n = len(x['time'])
+
+        xm, ym = x - x.mean(dim='time'), y - y.mean(dim='time')
+        xm_ym = xm * ym
+        r_num = xm_ym.sum(dim='time')
+        xm_squared = xr.ufuncs.square(xm)
+        ym_squared = xr.ufuncs.square(ym)
+        r_den = xr.ufuncs.sqrt(xm_squared.sum(dim='time') *
+                               ym_squared.sum(dim='time'))
+        r_den = r_den.where(r_den != 0)
+        r = r_num / r_den
+
+        # Presumably, if abs(r) > 1, then it is only some small artifact of floating
+        # point arithmetic.
+        # At this point r should be a lon/lat dataArray, so it should be safe to
+        # load it in memory explicitly. This may take time as it will kick-start
+        # deferred processing.
+        # Comparing with NaN produces warnings that can be safely ignored
+        default_warning_settings = np.seterr(invalid='ignore')
+        with monitor.child(1).observing("task 1"):
+            negativ_r = r.values < -1.0
+        with monitor.child(1).observing("task 2"):
+            r.values[negativ_r] = -1.0
+        with monitor.child(1).observing("task 3"):
+            positiv_r = r.values > 1.0
+        with monitor.child(1).observing("task 4"):
+            r.values[positiv_r] = 1.0
+        np.seterr(**default_warning_settings)
+        r.attrs = {'description': 'Correlation coefficients between'
+                                  ' {} and {}.'.format(x.name, y.name)}
+
+        df = n - 2
+        t_squared = xr.ufuncs.square(r) * (df / ((1.0 - r.where(r != 1)) *
+                                                 (1.0 + r.where(r != -1))))
+        prob = df / (df + t_squared)
+        with monitor.child(1).observing("task 5"):
+            prob_values_in = prob.values
+        with monitor.child(1).observing("task 6"):
+            prob.values = betainc(0.5 * df, 0.5, prob_values_in)
+        prob.attrs = {'description': 'Rough indicator of probability of an'
+                                     ' uncorrelated system producing datasets that have a Pearson'
+                                     ' correlation at least as extreme as the one computed from'
+                                     ' these datsets. Not entirely reliable, but reasonable for'
+                                     ' datasets larger than 500 or so.'}
+
+        retset = xr.Dataset({'corr_coef': r,
+                             'p_value': prob})
     return retset
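
And a matching sketch for the scalar correlation op (again not part of the commit): both inputs must share a time axis with at least three steps, and the result is a one-row DataFrame with corr_coef and p_value columns. The variable names and random data are invented; passing Monitor.NONE explicitly just makes the new optional argument visible.

import numpy as np
import pandas as pd
import xarray as xr

from cate.ops.correlation import pearson_correlation_scalar
from cate.util import Monitor

# Two dummy single-variable time series on the same monthly time axis.
time = pd.date_range('2000-01-01', periods=12, freq='MS')
ds_x = xr.Dataset({'sst': ('time', np.random.rand(12))}, coords={'time': time})
ds_y = xr.Dataset({'soil_moisture': ('time', np.random.rand(12))}, coords={'time': time})

# Returns a one-row DataFrame with 'corr_coef' and 'p_value'.
result = pearson_correlation_scalar(ds_x, ds_y, 'sst', 'soil_moisture',
                                    monitor=Monitor.NONE)
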