18 simplify kwget stats from dataset #19

Merged
merged 3 commits into from Apr 24, 2024

94 changes: 46 additions & 48 deletions examples/KWK_getcheckdata.py
@@ -27,14 +27,13 @@
except ModuleNotFoundError:
dfmt_available = False

# TODO: report wl/ext missings/duplicates/outliers in recent period 2000-2021 (based on data_summary csv's)
# TODO: visually check availability (start/stop/gaps/aggers) of wl/ext, monthmean wl, outliers. Create new issues if needed: https://github.com/Deltares-research/kenmerkendewaarden/issues/4
# TODO: overview of data issues in https://github.com/Deltares-research/kenmerkendewaarden/issues/4
# TODO: all TODOs in this script

retrieve_meas_amount = False
plot_meas_amount = True
plot_meas_amount = False
retrieve_data = False
create_summary = False
create_summary = True
test = False


@@ -76,10 +75,11 @@

bool_grootheid = locations["Grootheid.Code"].isin(["WATHTE"])
bool_groepering_ts = locations["Groepering.Code"].isin(["NVT"])
bool_groepering_ext = locations["Groepering.Code"].isin(["GETETM2","GETETMSL2"]) # TODO: why distinction between MSL and NAP? This is already filtered via Hoedanigheid
bool_groepering_ext = locations["Groepering.Code"].isin(["GETETM2","GETETMSL2"]) # TODO: why distinction between MSL and NAP? This is already filtered via Hoedanigheid >> maybe fixed in WADAR/aquostandard
# TODO: for now we do not separately retrieve NAP and MSL for EURPFM/LICHTELGRE which have both sets (https://github.com/Rijkswaterstaat/wm-ws-dl/issues/17), these stations are skipped
# bool_hoedanigheid_nap = locations["Hoedanigheid.Code"].isin(["NAP"])
# bool_hoedanigheid_msl = locations["Hoedanigheid.Code"].isin(["MSL"])
# TODO: we cannot subset Typering.Code==GETETTPE here (not present in dataframe), so we use Grootheid.Code==NVT: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/19
# TODO: we cannot subset Typering.Code==GETETTPE here (not present in dataframe), so we use Grootheid.Code==NVT: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/19 >> maybe fixed in WADAR/aquostandard
bool_grootheid_exttypes = locations['Grootheid.Code'].isin(['NVT'])

# select locations on grootheid/groepering/exttypes
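# (illustrative sketch, not part of the diff: the boolean masks above are
# presumably combined via pandas boolean indexing along these lines; the
# actual selection lines fall outside the shown hunk, and locs_meas_ext /
# locs_meas_exttypes are assumed names)
# locs_meas_ts = locations.loc[bool_grootheid & bool_groepering_ts]
# locs_meas_ext = locations.loc[bool_grootheid & bool_groepering_ext]
# locs_meas_exttypes = locations.loc[bool_grootheid_exttypes & bool_groepering_ext]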
@@ -112,7 +112,8 @@
print(f'{locs_meas_ts.loc[bool_isstation,["Naam","Locatie_MessageID","Hoedanigheid.Code"]]}')
print()

# TODO: fix multiple hits (two rows probably raises error in retrieve_data code below): https://github.com/Rijkswaterstaat/wm-ws-dl/issues/12

# TODO: fix multiple hits (two location rows raise errors in retrieve_data code below so are skipped)
"""
# from station_list_tk
station name BATH found 2 times, should be 1:
@@ -152,8 +153,15 @@
NES Nes 10309 NAP
"""

# TODO: missing/duplicate stations reported in https://github.com/Rijkswaterstaat/wm-ws-dl/issues/39. Many of these are not retrieved since we use clean_df for ddlpy
# TODO: some stations are now realtime instead of hist
# skip duplicate code stations (hist/realtime) # TODO: avoid this: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/12
stations_realtime_hist_dupl = ["BATH", "D15", "J6", "NES"]
# skip MSL/NAP duplicate stations # TODO: avoid this: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/17
stations_nap_mls_dupl = ["EURPFM", "LICHTELGRE"]
stations_dupl = stations_realtime_hist_dupl + stations_nap_mls_dupl


# TODO: missings/duplicates reported in https://github.com/Rijkswaterstaat/wm-ws-dl/issues/39. Some of the duplicates are not retrieved since we use clean_df in ddlpy
# TODO: some stations are now realtime instead of hist, these are skipped in actual data retrieval/statistics
### RETRIEVE MEASUREMENTS AMOUNT
ts_amount_list = []
ext_amount_list = []
@@ -170,7 +178,7 @@
amount_ts = ddlpy.measurements_amount(location=loc_meas_ts_one.iloc[0], start_date=start_date, end_date=end_date)
amount_ts_clean = amount_ts.set_index("Groeperingsperiode").rename(columns={"AantalMetingen":current_station})
if amount_ts_clean.index.duplicated().any():
# TODO: multiple 1993 in dataframe for BATH, because of multiple waardebepalingsmethoden/meetapparaten: https://github.com/Deltares/ddlpy/issues/92
# TODO: multiple 1993 in dataframe for BATH, because of multiple waardebepalingsmethoden/meetapparaten: https://github.com/Deltares/ddlpy/issues/92. This will also prevent the need for the "set_index" here
amount_ts_clean = amount_ts_clean.groupby(amount_ts_clean.index).sum()
ts_amount_list.append(amount_ts_clean)

@@ -223,7 +231,7 @@



### RETRIEVE DATA FROM DDL AND WRITE TO PICKLE
### RETRIEVE DATA FROM DDL AND WRITE TO NETCDF
drop_if_constant = ["WaarnemingMetadata.OpdrachtgevendeInstantieLijst",
"WaarnemingMetadata.BemonsteringshoogteLijst",
"WaarnemingMetadata.ReferentievlakLijst",
@@ -234,16 +242,11 @@
"WaardeBepalingsmethode.Code", "MeetApparaat.Code",
]
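# (note, not part of the diff: drop_if_constant is passed to
# ddlpy.dataframe_to_xarray below; the assumption is that these metadata
# columns are constant per station, so they can be dropped from the data
# variables without losing information, e.g.:)
# meas_ext_ds = ddlpy.dataframe_to_xarray(measurements_ext, drop_if_constant)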


for current_station in station_list:
if not retrieve_data:
continue

# skip duplicate code stations (hist/realtime) # TODO: avoid this
if current_station in ["BATH", "D15", "J6", "NES"]:
continue
# skip MSL/NAP duplicate stations # TODO: avoid this
if current_station in ["EURPFM", "LICHTELGRE"]:
if current_station in stations_dupl:
continue

# write to netcdf instead (including metadata)
@@ -289,7 +292,7 @@
meas_ext_ds = ddlpy.dataframe_to_xarray(measurements_ext, drop_if_constant)

#convert extreme type to HWLWcode, add extreme type and HWLWcode as dataset variables
# TODO: simplify by retrieving the extreme value and type from ddl in a single request (not supported yet)
# TODO: simplify by retrieving the extreme value and type from ddl in a single request (not supported yet): https://github.com/Rijkswaterstaat/wm-ws-dl/issues/19
ts_meas_extval_pd = hatyan.ddlpy_to_hatyan(measurements_ext)
ts_meas_exttype_pd = hatyan.ddlpy_to_hatyan(measurements_exttyp)
ts_meas_ext_pd = hatyan.convert_HWLWstr2num(ts_meas_extval_pd, ts_meas_exttype_pd)
@@ -304,6 +307,10 @@
for current_station in station_list:
if not create_summary:
continue

if current_station in stations_dupl:
continue

print(f'checking data for {current_station}')
data_summary_row_ts = {}
data_summary_row_ext = {}
@@ -313,8 +320,6 @@
sumrow['Code'] = locs_meas_ts.loc[current_station].name
sumrow['RDx'] = locs_meas_ts.loc[current_station,'RDx']
sumrow['RDy'] = locs_meas_ts.loc[current_station,'RDy']
time_interest_start = dt.datetime(2000,1,1)
time_interest_stop = dt.datetime(2021,2,1)

#load measwl data
file_wl_nc = os.path.join(dir_meas,f"{current_station}_measwl.nc")
@@ -330,7 +335,7 @@
meta_dict_flat_ts = kw.get_flat_meta_from_dataset(ds_ts_meas)
data_summary_row_ts.update(meta_dict_flat_ts)

ds_stats = kw.get_stats_from_dataset(ds_ts_meas, time_interest_start=time_interest_start, time_interest_stop=time_interest_stop)
ds_stats = kw.get_stats_from_dataset(ds_ts_meas)
data_summary_row_ts.update(ds_stats)

# calculate monthly/yearly mean for meas wl data
@@ -361,11 +366,12 @@
meta_dict_flat_ext = kw.get_flat_meta_from_dataset(ds_ext_meas)
data_summary_row_ext.update(meta_dict_flat_ext)

ds_stats = kw.get_stats_from_dataset(ds_ext_meas, time_interest_start=time_interest_start, time_interest_stop=time_interest_stop)
# TODO: warns about extremes being too close for BERGSDSWT, BROUWHVSGT02, BROUWHVSGT08, HOEKVHLD and probably more, probably due to aggers
ds_stats = kw.get_stats_from_dataset(ds_ext_meas)
data_summary_row_ext.update(ds_stats)

#calculate monthly/yearly mean for meas ext data
#TODO: make kw function (exact or approximation?)
#TODO: make kw function (exact or approximation?), also for timeseries
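# (sketch only, not part of the diff: such a kw helper could approximate the
# monthly/yearly means with pandas resampling, assuming a hypothetical series
# "wl" with a DatetimeIndex; an exact variant would need to account for the
# uneven spacing of the extremes:)
# mean_permonth = wl.resample("MS").mean()  # monthly mean (month start)
# mean_peryear = wl.resample("YS").mean()   # yearly mean (year start)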
ts_meas_ext_pd = kw.xarray_to_hatyan(ds_ext_meas)
if len(ts_meas_ext_pd['HWLWcode'].unique()) > 2:
data_pd_HWLW_12 = hatyan.calc_HWLW12345to12(ts_meas_ext_pd) #convert 12345 to 12 by taking minimum of 345 as 2 (laagste laagwater, i.e. lowest low water). TODO: currently, first/last values are skipped if LW
@@ -394,33 +400,31 @@
row_list_ext.append(pd.Series(data_summary_row_ext))

#plotting
file_wl_png = os.path.join(dir_meas,f'ts_{current_station}.png')
# if os.path.exists(file_wl_png):
# continue #skip the plotting if there is already a png available
if not os.path.exists(file_wl_nc):
print("[NO DATA, skipping plot]")
continue
if os.path.exists(file_ext_nc):
fig,(ax1,ax2) = hatyan.plot_timeseries(ts=ts_meas_pd, ts_ext=ts_meas_ext_pd)
else:
fig,(ax1,ax2) = hatyan.plot_timeseries(ts=ts_meas_pd)
ax1.set_title(f'timeseries for {current_station}')
ax1_legendlabels = ax1.get_legend_handles_labels()[1]
ax2_legendlabels = ['zero']
ax1_legendlabels.insert(1,'zero') #legend for zero line was not displayed but will be now so it needs to be added
ax1_legendlabels[0] = 'measured waterlevels'
ax1_legendlabels[2] = 'mean'
ax1.plot(mean_peryearmonth_long,'c',linewidth=0.7); ax1_legendlabels.append('monthly mean')
ax1.plot(mean_peryear_long,'m',linewidth=0.7); ax1_legendlabels.append('yearly mean')
ax2.plot(mean_peryearmonth_long,'c',linewidth=0.7); ax2_legendlabels.append('monthly mean')
ax2.plot(mean_peryear_long,'m',linewidth=0.7); ax2_legendlabels.append('yearly mean')
ax1.plot(mean_peryearmonth_long,'c',linewidth=0.7, label='monthly mean')
ax1.plot(mean_peryear_long,'m',linewidth=0.7, label='yearly mean')
ax2.plot(mean_peryearmonth_long,'c',linewidth=0.7, label='monthly mean')
ax2.plot(mean_peryear_long,'m',linewidth=0.7, label='yearly mean')
if os.path.exists(file_ext_nc):
ax1.plot(HW_mean_peryear_long,'m',linewidth=0.7, label=None) #'yearly mean HW')
ax1.plot(LW_mean_peryear_long,'m',linewidth=0.7, label=None) #'yearly mean LW')
ax1.set_ylim(-4,4)
ax1.legend(ax1_legendlabels,loc=4)
ax2.legend(ax2_legendlabels,loc=1)
if os.path.exists(file_ext_nc): #plot after legend creation, so these entries are not included
ax1.plot(HW_mean_peryear_long,'m',linewidth=0.7)#; ax1_legendlabels.append('yearly mean')
ax1.plot(LW_mean_peryear_long,'m',linewidth=0.7)#; ax1_legendlabels.append('yearly mean')
ax1.legend(loc=4)
ax2.legend(loc=1)
ax2.set_ylim(-0.5,0.5)
ax1.set_xlim(fig_alltimes_xlim) # entire period

# save figure
file_wl_png = os.path.join(dir_meas,f'ts_{current_station}.png')
fig.savefig(file_wl_png.replace('.png','_alldata.png'))
ax1.set_xlim(dt.datetime(2000,1,1),dt.datetime(2022,1,1)) # period of interest
ax1.set_xlim(dt.datetime(2000,1,1),dt.datetime(2024,1,1)) # period of interest
fig.savefig(file_wl_png)
plt.close(fig)

@@ -433,17 +437,11 @@
data_summary_ts.to_csv(os.path.join(dir_meas,'data_summary_ts.csv'))
data_summary_ext.to_csv(os.path.join(dir_meas,'data_summary_ext.csv'))

#print and save data_summary
print(data_summary_ts[['data_wl','tstart','tstop','nvals','dupltimes','#nans','#nans_2000to202102']])
try:
print(data_summary_ext[['data_ext','dupltimes','#HWgaps_2000to202102']])
except KeyError:
print(data_summary_ext[['data_ext','dupltimes']])

#make spatial plot of available/retrieved stations
fig_map,ax_map = plt.subplots(figsize=(8,7))

ax_map.plot(data_summary_ts['RDx'], data_summary_ts['RDy'],'xk')#,alpha=0.4) #all ext stations
ax_map.plot(data_summary_ext['RDx'], data_summary_ext['RDy'],'xk')#,alpha=0.4) #all ext stations
ax_map.plot(data_summary_ext.loc[data_summary_ext['data_ext'],'RDx'], data_summary_ext.loc[data_summary_ext['data_ext'],'RDy'],'xr') # selected ext stations (stat_list)
"""
for iR, row in data_summary_ts.iterrows():
16 changes: 2 additions & 14 deletions kenmerkendewaarden/utils.py
@@ -48,7 +48,7 @@ def get_flat_meta_from_dataset(ds):
return meta_dict_flat


def get_stats_from_dataset(ds, time_interest_start=None, time_interest_stop=None):
def get_stats_from_dataset(ds):
ds_stats = {}

# TODO: beware on timezones
@@ -75,19 +75,7 @@ def get_stats_from_dataset(ds, time_interest_start=None, time_interest_stop=None
else:
qc_none = False
ds_stats['qc_none'] = qc_none

if time_interest_start is not None or time_interest_stop is not None:
#calc #nan-values in recent period
# TODO: generalize interest period
ds_2000to202102 = ds.sel(time=slice(time_interest_start,time_interest_stop))
ts_timediff_2000to202102 = ds.time.to_pandas().index.diff()[1:]
ds_stats['tstart2000'] = ds.time.to_pandas().min()<=time_interest_start
ds_stats['tstop202102'] = ds.time.to_pandas().max()>=time_interest_stop
ds_stats['nvals_2000to202102'] = len(ds_2000to202102['Meetwaarde.Waarde_Numeriek'])
ds_stats['#nans_2000to202102'] = ds_2000to202102['Meetwaarde.Waarde_Numeriek'].isnull().values.sum()
ds_stats['mintimediff_2000to202102'] = str(ts_timediff_2000to202102.min())
ds_stats['maxtimediff_2000to202102'] = str(ts_timediff_2000to202102.max())


if "HWLWcode" in ds.data_vars:
#TODO: should be based on 12 only, not 345 (HOEKVHLD now gives warning)
if ts_timediff.min() < pd.Timedelta(hours=4): #TODO: min timediff for e.g. BROUWHVSGT08 is 3 minutes: ts_meas_ext_pd.loc[dt.datetime(2015,1,1):dt.datetime(2015,1,2),['values', 'QC', 'Status']]. This should not happen and with new dataset should be converted to an error
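With the simplified signature, a caller that still needs statistics for a
recent period of interest can slice the dataset before calling the function,
reusing the ds.sel(time=slice(...)) pattern from the removed block (sketch
only; assumes the package is imported as kw, as in the example script, and
reuses the old 2000-01-01 to 2021-02-01 window):

import datetime as dt
import kenmerkendewaarden as kw

# restrict to the period of interest, then compute stats on the subset
ds_recent = ds.sel(time=slice(dt.datetime(2000, 1, 1), dt.datetime(2021, 2, 1)))
ds_stats_recent = kw.get_stats_from_dataset(ds_recent)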