

18 simplify kw.get_stats_from_dataset (#19)
* simplify statistics function

* skip duplicate stations in statistics

* simplified plot and improved comments
veenstrajelmer authored Apr 24, 2024
1 parent 1a8748e commit 6323a64
Showing 2 changed files with 48 additions and 62 deletions.
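The gist of the change: kw.get_stats_from_dataset no longer takes an interest period, so the call sites shrink accordingly. A minimal before/after sketch (assuming the package is imported as kw and ds is an xarray dataset as produced by ddlpy.dataframe_to_xarray, as in the example script):

    import kenmerkendewaarden as kw

    # before: interest-period statistics were computed inside the function
    # ds_stats = kw.get_stats_from_dataset(ds, time_interest_start=start, time_interest_stop=stop)

    # after: plain statistics over the full dataset
    ds_stats = kw.get_stats_from_dataset(ds)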
94 changes: 46 additions & 48 deletions examples/KWK_getcheckdata.py
@@ -27,14 +27,13 @@
except ModuleNotFoundError:
dfmt_available = False

# TODO: report wl/ext missings/duplicates/outliers in recent period 2000-2021 (based on data_summary csv's)
# TODO: visually check availability (start/stop/gaps/aggers) of wl/ext, monthmean wl, outliers. Create new issues if needed: https://github.com/Deltares-research/kenmerkendewaarden/issues/4
-# TODO: overview of data issues in https://github.com/Deltares-research/kenmerkendewaarden/issues/4
# TODO: all TODOS in this script

retrieve_meas_amount = False
-plot_meas_amount = True
+plot_meas_amount = False
retrieve_data = False
-create_summary = False
+create_summary = True
test = False


@@ -76,10 +75,11 @@

bool_grootheid = locations["Grootheid.Code"].isin(["WATHTE"])
bool_groepering_ts = locations["Groepering.Code"].isin(["NVT"])
-bool_groepering_ext = locations["Groepering.Code"].isin(["GETETM2","GETETMSL2"]) # TODO: why distinction between MSL and NAP? This is already filtered via Hoedanigheid
+bool_groepering_ext = locations["Groepering.Code"].isin(["GETETM2","GETETMSL2"]) # TODO: why distinction between MSL and NAP? This is already filtered via Hoedanigheid >> maybe fixed in WADAR/aquostandard
+# TODO: for now we do not separately retrieve NAP and MSL for EURPFM/LICHTELGRE which have both sets (https://github.com/Rijkswaterstaat/wm-ws-dl/issues/17), these stations are skipped
# bool_hoedanigheid_nap = locations["Hoedanigheid.Code"].isin(["NAP"])
# bool_hoedanigheid_msl = locations["Hoedanigheid.Code"].isin(["MSL"])
-# TODO: we cannot subset Typering.Code==GETETTPE here (not present in dataframe), so we use Grootheid.Code==NVT: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/19
+# TODO: we cannot subset Typering.Code==GETETTPE here (not present in dataframe), so we use Grootheid.Code==NVT: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/19 >> maybe fixed in WADAR/aquostandard
bool_grootheid_exttypes = locations['Grootheid.Code'].isin(['NVT'])

# select locations on grootheid/groepering/exttypes
@@ -112,7 +112,8 @@
print(f'{locs_meas_ts.loc[bool_isstation,["Naam","Locatie_MessageID","Hoedanigheid.Code"]]}')
print()

-# TODO: fix multiple hits (two rows probably raises error in retrieve_data code below): https://github.com/Rijkswaterstaat/wm-ws-dl/issues/12

+# TODO: fix multiple hits (two location rows raise errors in retrieve_data code below, so they are skipped)
"""
# from station_list_tk
station name BATH found 2 times, should be 1:
@@ -152,8 +153,15 @@
NES Nes 10309 NAP
"""

-# TODO: missing/duplicate stations reported in https://github.com/Rijkswaterstaat/wm-ws-dl/issues/39. Many of these are not retrieved since we use clean_df for ddlpy
-# TODO: some stations are now realtime instead of hist
+# skip duplicate code stations (hist/realtime) # TODO: avoid this: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/12
+stations_realtime_hist_dupl = ["BATH", "D15", "J6", "NES"]
+# skip MSL/NAP duplicate stations # TODO: avoid this: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/17
+stations_nap_mls_dupl = ["EURPFM", "LICHTELGRE"]
+stations_dupl = stations_realtime_hist_dupl + stations_nap_mls_dupl


+# TODO: missings/duplicates reported in https://github.com/Rijkswaterstaat/wm-ws-dl/issues/39. Some of the duplicates are not retrieved since we use clean_df in ddlpy
+# TODO: some stations are now realtime instead of hist; these are skipped in actual data retrieval/statistics
### RETRIEVE MEASUREMENTS AMOUNT
ts_amount_list = []
ext_amount_list = []
@@ -170,7 +178,7 @@
amount_ts = ddlpy.measurements_amount(location=loc_meas_ts_one.iloc[0], start_date=start_date, end_date=end_date)
amount_ts_clean = amount_ts.set_index("Groeperingsperiode").rename(columns={"AantalMetingen":current_station})
if amount_ts_clean.index.duplicated().any():
-# TODO: multiple 1993 in dataframe for BATH, because of multiple waardebepalingsmethoden/meetapparaten: https://github.com/Deltares/ddlpy/issues/92
+# TODO: multiple 1993 in dataframe for BATH, because of multiple waardebepalingsmethoden/meetapparaten: https://github.com/Deltares/ddlpy/issues/92. Fixing this will also remove the need for the set_index here
amount_ts_clean = amount_ts_clean.groupby(amount_ts_clean.index).sum()
ts_amount_list.append(amount_ts_clean)
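The groupby/sum above collapses duplicated Groeperingsperiode entries into one row per year; a toy illustration of the pattern (hypothetical amounts, not actual DDL output):

    import pandas as pd

    # hypothetical amounts with a duplicated 1993 index, as happens for BATH
    amount = pd.DataFrame({"BATH": [10, 5, 7]},
                          index=pd.Index([1992, 1993, 1993], name="Groeperingsperiode"))
    # summing per index value merges the duplicated rows into one
    amount_clean = amount.groupby(amount.index).sum()
    print(amount_clean)  # 1992 -> 10, 1993 -> 12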

@@ -223,7 +231,7 @@



-### RETRIEVE DATA FROM DDL AND WRITE TO PICKLE
+### RETRIEVE DATA FROM DDL AND WRITE TO NETCDF
drop_if_constant = ["WaarnemingMetadata.OpdrachtgevendeInstantieLijst",
"WaarnemingMetadata.BemonsteringshoogteLijst",
"WaarnemingMetadata.ReferentievlakLijst",
@@ -234,16 +242,11 @@
"WaardeBepalingsmethode.Code", "MeetApparaat.Code",
]


for current_station in station_list:
if not retrieve_data:
continue

-# skip duplicate code stations (hist/realtime) # TODO: avoid this
-if current_station in ["BATH", "D15", "J6", "NES"]:
-continue
-# skip MSL/NAP duplicate stations # TODO: avoid this
-if current_station in ["EURPFM", "LICHTELGRE"]:
+if current_station in stations_dupl:
continue

# write to netcdf instead (including metadata)
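For context, the retrieve-and-write step that follows boils down to ddlpy's measurements/dataframe_to_xarray calls plus xarray's netcdf writer. A sketch under the assumption that loc_meas_ts_one, start_date/end_date, dir_meas and drop_if_constant are defined as earlier in this script:

    import os
    import ddlpy

    # retrieve raw measurements for one location row, then convert to xarray
    measurements = ddlpy.measurements(loc_meas_ts_one.iloc[0], start_date=start_date, end_date=end_date)
    ds_ts = ddlpy.dataframe_to_xarray(measurements, drop_if_constant)
    # netcdf keeps the (flattened) metadata, unlike the previous pickle files
    ds_ts.to_netcdf(os.path.join(dir_meas, f"{current_station}_measwl.nc"))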
@@ -289,7 +292,7 @@
meas_ext_ds = ddlpy.dataframe_to_xarray(measurements_ext, drop_if_constant)

#convert extreme type to HWLWcode, add extreme type and HWLWcode as dataset variables
-# TODO: simplify by retrieving the extreme value and type from ddl in a single request (not supported yet)
+# TODO: simplify by retrieving the extreme value and type from ddl in a single request (not supported yet): https://github.com/Rijkswaterstaat/wm-ws-dl/issues/19
ts_meas_extval_pd = hatyan.ddlpy_to_hatyan(measurements_ext)
ts_meas_exttype_pd = hatyan.ddlpy_to_hatyan(measurements_exttyp)
ts_meas_ext_pd = hatyan.convert_HWLWstr2num(ts_meas_extval_pd, ts_meas_exttype_pd)
@@ -304,6 +307,10 @@
for current_station in station_list:
if not create_summary:
continue

+if current_station in stations_dupl:
+continue

print(f'checking data for {current_station}')
data_summary_row_ts = {}
data_summary_row_ext = {}
@@ -313,8 +320,6 @@
sumrow['Code'] = locs_meas_ts.loc[current_station].name
sumrow['RDx'] = locs_meas_ts.loc[current_station,'RDx']
sumrow['RDy'] = locs_meas_ts.loc[current_station,'RDy']
-time_interest_start = dt.datetime(2000,1,1)
-time_interest_stop = dt.datetime(2021,2,1)

#load measwl data
file_wl_nc = os.path.join(dir_meas,f"{current_station}_measwl.nc")
@@ -330,7 +335,7 @@
meta_dict_flat_ts = kw.get_flat_meta_from_dataset(ds_ts_meas)
data_summary_row_ts.update(meta_dict_flat_ts)

-ds_stats = kw.get_stats_from_dataset(ds_ts_meas, time_interest_start=time_interest_start, time_interest_stop=time_interest_stop)
+ds_stats = kw.get_stats_from_dataset(ds_ts_meas)
data_summary_row_ts.update(ds_stats)

# calculate monthly/yearly mean for meas wl data
@@ -361,11 +366,12 @@
meta_dict_flat_ext = kw.get_flat_meta_from_dataset(ds_ext_meas)
data_summary_row_ext.update(meta_dict_flat_ext)

-ds_stats = kw.get_stats_from_dataset(ds_ext_meas, time_interest_start=time_interest_start, time_interest_stop=time_interest_stop)
+# TODO: warns about extremes being too close for BERGSDSWT, BROUWHVSGT02, BROUWHVSGT08, HOEKVHLD and probably more, probably due to aggers
+ds_stats = kw.get_stats_from_dataset(ds_ext_meas)
data_summary_row_ext.update(ds_stats)

#calculate monthly/yearly mean for meas ext data
-#TODO: make kw function (exact or approximation?)
+#TODO: make kw function (exact or approximation?), also for timeseries
ts_meas_ext_pd = kw.xarray_to_hatyan(ds_ext_meas)
if len(ts_meas_ext_pd['HWLWcode'].unique()) > 2:
data_pd_HWLW_12 = hatyan.calc_HWLW12345to12(ts_meas_ext_pd) #convert 12345 to 12 by taking minimum of 345 as 2 (laagste laagwater). TODO: currently, first/last values are skipped if LW
@@ -394,33 +400,31 @@
row_list_ext.append(pd.Series(data_summary_row_ext))

#plotting
-file_wl_png = os.path.join(dir_meas,f'ts_{current_station}.png')
-# if os.path.exists(file_wl_png):
-# continue #skip the plotting if there is already a png available
+if not os.path.exists(file_wl_nc):
+print("[NO DATA, skipping plot]")
+continue
if os.path.exists(file_ext_nc):
fig,(ax1,ax2) = hatyan.plot_timeseries(ts=ts_meas_pd, ts_ext=ts_meas_ext_pd)
else:
fig,(ax1,ax2) = hatyan.plot_timeseries(ts=ts_meas_pd)
ax1.set_title(f'timeseries for {current_station}')
-ax1_legendlabels = ax1.get_legend_handles_labels()[1]
-ax2_legendlabels = ['zero']
-ax1_legendlabels.insert(1,'zero') #legend for zero line was not displayed but will be now so it needs to be added
-ax1_legendlabels[0] = 'measured waterlevels'
-ax1_legendlabels[2] = 'mean'
-ax1.plot(mean_peryearmonth_long,'c',linewidth=0.7); ax1_legendlabels.append('monthly mean')
-ax1.plot(mean_peryear_long,'m',linewidth=0.7); ax1_legendlabels.append('yearly mean')
-ax2.plot(mean_peryearmonth_long,'c',linewidth=0.7); ax2_legendlabels.append('monthly mean')
-ax2.plot(mean_peryear_long,'m',linewidth=0.7); ax2_legendlabels.append('yearly mean')
+ax1.plot(mean_peryearmonth_long,'c',linewidth=0.7, label='monthly mean')
+ax1.plot(mean_peryear_long,'m',linewidth=0.7, label='yearly mean')
+ax2.plot(mean_peryearmonth_long,'c',linewidth=0.7, label='monthly mean')
+ax2.plot(mean_peryear_long,'m',linewidth=0.7, label='yearly mean')
+if os.path.exists(file_ext_nc):
+ax1.plot(HW_mean_peryear_long,'m',linewidth=0.7, label=None) #'yearly mean HW')
+ax1.plot(LW_mean_peryear_long,'m',linewidth=0.7, label=None) #'yearly mean LW')
ax1.set_ylim(-4,4)
-ax1.legend(ax1_legendlabels,loc=4)
-ax2.legend(ax2_legendlabels,loc=1)
-if os.path.exists(file_ext_nc): #plot after legend creation, so these entries are not included
-ax1.plot(HW_mean_peryear_long,'m',linewidth=0.7)#; ax1_legendlabels.append('yearly mean')
-ax1.plot(LW_mean_peryear_long,'m',linewidth=0.7)#; ax1_legendlabels.append('yearly mean')
+ax1.legend(loc=4)
+ax2.legend(loc=1)
ax2.set_ylim(-0.5,0.5)
ax1.set_xlim(fig_alltimes_xlim) # entire period

+# save figure
+file_wl_png = os.path.join(dir_meas,f'ts_{current_station}.png')
fig.savefig(file_wl_png.replace('.png','_alldata.png'))
-ax1.set_xlim(dt.datetime(2000,1,1),dt.datetime(2022,1,1)) # period of interest
+ax1.set_xlim(dt.datetime(2000,1,1),dt.datetime(2024,1,1)) # period of interest
fig.savefig(file_wl_png)
plt.close(fig)

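The plotting simplification relies on matplotlib's label mechanism: each line gets its legend entry via label= at plot time, lines with label=None stay out of the legend, and ax.legend() no longer needs a hand-maintained list of strings. A standalone illustration of that behavior (toy data, not from the script):

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], 'c', linewidth=0.7, label='monthly mean')
    ax.plot([0, 1], [1, 0], 'm', linewidth=0.7, label='yearly mean')
    ax.plot([0, 1], [0.5, 0.5], 'm', linewidth=0.7, label=None)  # no legend entry
    ax.legend(loc=4)  # legend is built from the label= kwargs only
    plt.close(fig)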
@@ -433,17 +437,11 @@
data_summary_ts.to_csv(os.path.join(dir_meas,'data_summary_ts.csv'))
data_summary_ext.to_csv(os.path.join(dir_meas,'data_summary_ext.csv'))

-#print and save data_summary
-print(data_summary_ts[['data_wl','tstart','tstop','nvals','dupltimes','#nans','#nans_2000to202102']])
-try:
-print(data_summary_ext[['data_ext','dupltimes','#HWgaps_2000to202102']])
-except KeyError:
-print(data_summary_ext[['data_ext','dupltimes']])

#make spatial plot of available/retrieved stations
fig_map,ax_map = plt.subplots(figsize=(8,7))

-ax_map.plot(data_summary_ts['RDx'], data_summary_ts['RDy'],'xk')#,alpha=0.4) #all ext stations
+ax_map.plot(data_summary_ext['RDx'], data_summary_ext['RDy'],'xk')#,alpha=0.4) #all ext stations
ax_map.plot(data_summary_ext.loc[data_summary_ext['data_ext'],'RDx'], data_summary_ext.loc[data_summary_ext['data_ext'],'RDy'],'xr') # selected ext stations (stat_list)
"""
for iR, row in data_summary_ts.iterrows():
16 changes: 2 additions & 14 deletions kenmerkendewaarden/utils.py
@@ -48,7 +48,7 @@ def get_flat_meta_from_dataset(ds):
return meta_dict_flat


-def get_stats_from_dataset(ds, time_interest_start=None, time_interest_stop=None):
+def get_stats_from_dataset(ds):
ds_stats = {}

# TODO: beware on timezones
@@ -75,19 +75,7 @@ def get_stats_from_dataset(ds, time_interest_start=None, time_interest_stop=None
else:
qc_none = False
ds_stats['qc_none'] = qc_none

-if time_interest_start is not None or time_interest_stop is not None:
-#calc #nan-values in recent period
-# TODO: generalize interest period
-ds_2000to202102 = ds.sel(time=slice(time_interest_start,time_interest_stop))
-ts_timediff_2000to202102 = ds.time.to_pandas().index.diff()[1:]
-ds_stats['tstart2000'] = ds.time.to_pandas().min()<=time_interest_start
-ds_stats['tstop202102'] = ds.time.to_pandas().max()>=time_interest_stop
-ds_stats['nvals_2000to202102'] = len(ds_2000to202102['Meetwaarde.Waarde_Numeriek'])
-ds_stats['#nans_2000to202102'] = ds_2000to202102['Meetwaarde.Waarde_Numeriek'].isnull().values.sum()
-ds_stats['mintimediff_2000to202102'] = str(ts_timediff_2000to202102.min())
-ds_stats['maxtimediff_2000to202102'] = str(ts_timediff_2000to202102.max())


if "HWLWcode" in ds.data_vars:
#TODO: should be based on 12 only, not 345 (HOEKVHLD now gives warning)
if ts_timediff.min() < pd.Timedelta(hours=4): #TODO: min timediff for e.g. BROUWHVSGT08 is 3 minutes: ts_meas_ext_pd.loc[dt.datetime(2015,1,1):dt.datetime(2015,1,2),['values', 'QC', 'Status']]. This should not happen and with new dataset should be converted to an error
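Callers that still want statistics for a specific window can slice first and then call the simplified function. A sketch, assuming an xarray dataset ds with a time coordinate and the kw alias as used in the example script:

    import datetime as dt

    # hypothetical interest period, matching the 2000-2021 window that was hardcoded before
    ds_interest = ds.sel(time=slice(dt.datetime(2000, 1, 1), dt.datetime(2021, 2, 1)))
    ds_stats_interest = kw.get_stats_from_dataset(ds_interest)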
