diff --git a/examples/KWK_getcheckdata.py b/examples/KWK_getcheckdata.py index ae73a1f..126a610 100644 --- a/examples/KWK_getcheckdata.py +++ b/examples/KWK_getcheckdata.py @@ -27,14 +27,13 @@ except ModuleNotFoundError: dfmt_available = False -# TODO: report wl/ext missings/duplicates/outliers in recent period 2000-2021 (based on data_summary csv's) -# TODO: visually check availability (start/stop/gaps/aggers) of wl/ext, monthmean wl, outliers. Create new issues if needed: https://github.com/Deltares-research/kenmerkendewaarden/issues/4 +# TODO: overview of data issues in https://github.com/Deltares-research/kenmerkendewaarden/issues/4 # TODO: all TODOS in this script retrieve_meas_amount = False -plot_meas_amount = True +plot_meas_amount = False retrieve_data = False -create_summary = False +create_summary = True test = False @@ -76,10 +75,11 @@ bool_grootheid = locations["Grootheid.Code"].isin(["WATHTE"]) bool_groepering_ts = locations["Groepering.Code"].isin(["NVT"]) -bool_groepering_ext = locations["Groepering.Code"].isin(["GETETM2","GETETMSL2"]) # TODO: why distinction between MSL and NAP? This is already filtered via Hoedanigheid +bool_groepering_ext = locations["Groepering.Code"].isin(["GETETM2","GETETMSL2"]) # TODO: why distinction between MSL and NAP? This is already filtered via Hoedanigheid >> maybe fixed in WADAR/aquostandard +# TODO: for now we do not separately retrieve NAP and MSL for EURPFM/LICHELGRE which have both sets (https://github.com/Rijkswaterstaat/wm-ws-dl/issues/17), these stations are skipped # bool_hoedanigheid_nap = locations["Hoedanigheid.Code"].isin(["NAP"]) # bool_hoedanigheid_msl = locations["Hoedanigheid.Code"].isin(["MSL"]) -# TODO: we cannot subset Typering.Code==GETETTPE here (not present in dataframe), so we use Grootheid.Code==NVT: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/19 +# TODO: we cannot subset Typering.Code==GETETTPE here (not present in dataframe), so we use Grootheid.Code==NVT: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/19 >> maybe fixed in WADAR/aquostandard bool_grootheid_exttypes = locations['Grootheid.Code'].isin(['NVT']) # select locations on grootheid/groepering/exttypes @@ -112,7 +112,8 @@ print(f'{locs_meas_ts.loc[bool_isstation,["Naam","Locatie_MessageID","Hoedanigheid.Code"]]}') print() -# TODO: fix multiple hits (two rows probably raises error in retrieve_data code below): https://github.com/Rijkswaterstaat/wm-ws-dl/issues/12 + +# TODO: fix multiple hits (two location rows raise errors in retrieve_data code below so are skipped) """ # from station_list_tk station name BATH found 2 times, should be 1: @@ -152,8 +153,15 @@ NES Nes 10309 NAP """ -# TODO: missing/duplicate stations reported in https://github.com/Rijkswaterstaat/wm-ws-dl/issues/39. Many of these are not retrieved since we use clean_df for ddlpy -# TODO: some stations are now realtime instead of hist +# skip duplicate code stations (hist/realtime) # TODO: avoid this https://github.com/Rijkswaterstaat/wm-ws-dl/issues/12 +stations_realtime_hist_dupl = ["BATH", "D15", "J6", "NES"] +# skip MSL/NAP duplicate stations # TODO: avoid this: https://github.com/Rijkswaterstaat/wm-ws-dl/issues/17 +stations_nap_mls_dupl = ["EURPFM", "LICHTELGRE"] +stations_dupl = stations_realtime_hist_dupl + stations_nap_mls_dupl + + +# TODO: missings/duplicates reported in https://github.com/Rijkswaterstaat/wm-ws-dl/issues/39. Some of the duplicates are not retrieved since we use clean_df in ddlpy +# TODO: some stations are now realtime instead of hist, these are skipped in actual data retrieval/statistics ### RETRIEVE MEASUREMENTS AMOUNT ts_amount_list = [] ext_amount_list = [] @@ -170,7 +178,7 @@ amount_ts = ddlpy.measurements_amount(location=loc_meas_ts_one.iloc[0], start_date=start_date, end_date=end_date) amount_ts_clean = amount_ts.set_index("Groeperingsperiode").rename(columns={"AantalMetingen":current_station}) if amount_ts_clean.index.duplicated().any(): - # TODO: multiple 1993 in dataframe for BATH, because of multiple waardebepalingsmethoden/meetapparaten: https://github.com/Deltares/ddlpy/issues/92 + # TODO: multiple 1993 in dataframe for BATH, because of multiple waardebepalingsmethoden/meetapparaten: https://github.com/Deltares/ddlpy/issues/92. This will also prevent the need for "set_index here" amount_ts_clean = amount_ts_clean.groupby(amount_ts_clean.index).sum() ts_amount_list.append(amount_ts_clean) @@ -223,7 +231,7 @@ -### RETRIEVE DATA FROM DDL AND WRITE TO PICKLE +### RETRIEVE DATA FROM DDL AND WRITE TO NETCDF drop_if_constant = ["WaarnemingMetadata.OpdrachtgevendeInstantieLijst", "WaarnemingMetadata.BemonsteringshoogteLijst", "WaarnemingMetadata.ReferentievlakLijst", @@ -234,16 +242,11 @@ "WaardeBepalingsmethode.Code", "MeetApparaat.Code", ] - for current_station in station_list: if not retrieve_data: continue - # skip duplicate code stations (hist/realtime) # TODO: avoid this - if current_station in ["BATH", "D15", "J6", "NES"]: - continue - # skip MSL/NAP duplicate stations # TODO: avoid this - if current_station in ["EURPFM", "LICHTELGRE"]: + if current_station in stations_dupl: continue # write to netcdf instead (including metadata) @@ -289,7 +292,7 @@ meas_ext_ds = ddlpy.dataframe_to_xarray(measurements_ext, drop_if_constant) #convert extreme type to HWLWcode add extreme type and HWLcode as dataset variables - # TODO: simplify by retrieving the extreme value and type from ddl in a single request (not supported yet) + # TODO: simplify by retrieving the extreme value and type from ddl in a single request (not supported yet): https://github.com/Rijkswaterstaat/wm-ws-dl/issues/19 ts_meas_extval_pd = hatyan.ddlpy_to_hatyan(measurements_ext) ts_meas_exttype_pd = hatyan.ddlpy_to_hatyan(measurements_exttyp) ts_meas_ext_pd = hatyan.convert_HWLWstr2num(ts_meas_extval_pd, ts_meas_exttype_pd) @@ -304,6 +307,10 @@ for current_station in station_list: if not create_summary: continue + + if current_station in stations_dupl: + continue + print(f'checking data for {current_station}') data_summary_row_ts = {} data_summary_row_ext = {} @@ -313,8 +320,6 @@ sumrow['Code'] = locs_meas_ts.loc[current_station].name sumrow['RDx'] = locs_meas_ts.loc[current_station,'RDx'] sumrow['RDy'] = locs_meas_ts.loc[current_station,'RDy'] - time_interest_start = dt.datetime(2000,1,1) - time_interest_stop = dt.datetime(2021,2,1) #load measwl data file_wl_nc = os.path.join(dir_meas,f"{current_station}_measwl.nc") @@ -330,7 +335,7 @@ meta_dict_flat_ts = kw.get_flat_meta_from_dataset(ds_ts_meas) data_summary_row_ts.update(meta_dict_flat_ts) - ds_stats = kw.get_stats_from_dataset(ds_ts_meas, time_interest_start=time_interest_start, time_interest_stop=time_interest_stop) + ds_stats = kw.get_stats_from_dataset(ds_ts_meas) data_summary_row_ts.update(ds_stats) # calculate monthly/yearly mean for meas wl data @@ -361,11 +366,12 @@ meta_dict_flat_ext = kw.get_flat_meta_from_dataset(ds_ext_meas) data_summary_row_ext.update(meta_dict_flat_ext) - ds_stats = kw.get_stats_from_dataset(ds_ext_meas, time_interest_start=time_interest_start, time_interest_stop=time_interest_stop) + # TODO: warns about extremes being too close for BERGSDSWT, BROUWHVSGT02, BROUWHVSGT08, HOEKVHLD and probably more, probably due to aggers + ds_stats = kw.get_stats_from_dataset(ds_ext_meas) data_summary_row_ext.update(ds_stats) #calculate monthly/yearly mean for meas ext data - #TODO: make kw function (exact or approximation?) + #TODO: make kw function (exact or approximation?), also for timeseries ts_meas_ext_pd = kw.xarray_to_hatyan(ds_ext_meas) if len(ts_meas_ext_pd['HWLWcode'].unique()) > 2: data_pd_HWLW_12 = hatyan.calc_HWLW12345to12(ts_meas_ext_pd) #convert 12345 to 12 by taking minimum of 345 as 2 (laagste laagwater). TODO: currently, first/last values are skipped if LW @@ -394,33 +400,31 @@ row_list_ext.append(pd.Series(data_summary_row_ext)) #plotting - file_wl_png = os.path.join(dir_meas,f'ts_{current_station}.png') - # if os.path.exists(file_wl_png): - # continue #skip the plotting if there is already a png available + if not os.path.exists(file_wl_nc): + print("[NO DATA, skipping plot]") + continue if os.path.exists(file_ext_nc): fig,(ax1,ax2) = hatyan.plot_timeseries(ts=ts_meas_pd, ts_ext=ts_meas_ext_pd) else: fig,(ax1,ax2) = hatyan.plot_timeseries(ts=ts_meas_pd) ax1.set_title(f'timeseries for {current_station}') - ax1_legendlabels = ax1.get_legend_handles_labels()[1] - ax2_legendlabels = ['zero'] - ax1_legendlabels.insert(1,'zero') #legend for zero line was not displayed but will be now so it needs to be added - ax1_legendlabels[0] = 'measured waterlevels' - ax1_legendlabels[2] = 'mean' - ax1.plot(mean_peryearmonth_long,'c',linewidth=0.7); ax1_legendlabels.append('monthly mean') - ax1.plot(mean_peryear_long,'m',linewidth=0.7); ax1_legendlabels.append('yearly mean') - ax2.plot(mean_peryearmonth_long,'c',linewidth=0.7); ax2_legendlabels.append('monthly mean') - ax2.plot(mean_peryear_long,'m',linewidth=0.7); ax2_legendlabels.append('yearly mean') + ax1.plot(mean_peryearmonth_long,'c',linewidth=0.7, label='monthly mean') + ax1.plot(mean_peryear_long,'m',linewidth=0.7, label='yearly mean') + ax2.plot(mean_peryearmonth_long,'c',linewidth=0.7, label='monthly mean') + ax2.plot(mean_peryear_long,'m',linewidth=0.7, label='yearly mean') + if os.path.exists(file_ext_nc): + ax1.plot(HW_mean_peryear_long,'m',linewidth=0.7, label=None) #'yearly mean HW') + ax1.plot(LW_mean_peryear_long,'m',linewidth=0.7, label=None) #'yearly mean LW') ax1.set_ylim(-4,4) - ax1.legend(ax1_legendlabels,loc=4) - ax2.legend(ax2_legendlabels,loc=1) - if os.path.exists(file_ext_nc): #plot after legend creation, so these entries are not included - ax1.plot(HW_mean_peryear_long,'m',linewidth=0.7)#; ax1_legendlabels.append('yearly mean') - ax1.plot(LW_mean_peryear_long,'m',linewidth=0.7)#; ax1_legendlabels.append('yearly mean') + ax1.legend(loc=4) + ax2.legend(loc=1) ax2.set_ylim(-0.5,0.5) ax1.set_xlim(fig_alltimes_xlim) # entire period + + # save figure + file_wl_png = os.path.join(dir_meas,f'ts_{current_station}.png') fig.savefig(file_wl_png.replace('.png','_alldata.png')) - ax1.set_xlim(dt.datetime(2000,1,1),dt.datetime(2022,1,1)) # period of interest + ax1.set_xlim(dt.datetime(2000,1,1),dt.datetime(2024,1,1)) # period of interest fig.savefig(file_wl_png) plt.close(fig) @@ -433,17 +437,11 @@ data_summary_ts.to_csv(os.path.join(dir_meas,'data_summary_ts.csv')) data_summary_ext.to_csv(os.path.join(dir_meas,'data_summary_ext.csv')) - #print and save data_summary - print(data_summary_ts[['data_wl','tstart','tstop','nvals','dupltimes','#nans','#nans_2000to202102']]) - try: - print(data_summary_ext[['data_ext','dupltimes','#HWgaps_2000to202102']]) - except KeyError: - print(data_summary_ext[['data_ext','dupltimes']]) #make spatial plot of available/retrieved stations fig_map,ax_map = plt.subplots(figsize=(8,7)) - ax_map.plot(data_summary_ts['RDx'], data_summary_ts['RDy'],'xk')#,alpha=0.4) #all ext stations + ax_map.plot(data_summary_ext['RDx'], data_summary_ext['RDy'],'xk')#,alpha=0.4) #all ext stations ax_map.plot(data_summary_ext.loc[data_summary_ext['data_ext'],'RDx'], data_summary_ext.loc[data_summary_ext['data_ext'],'RDy'],'xr') # selected ext stations (stat_list) """ for iR, row in data_summary_ts.iterrows(): diff --git a/kenmerkendewaarden/utils.py b/kenmerkendewaarden/utils.py index 085d10d..d2f703c 100644 --- a/kenmerkendewaarden/utils.py +++ b/kenmerkendewaarden/utils.py @@ -48,7 +48,7 @@ def get_flat_meta_from_dataset(ds): return meta_dict_flat -def get_stats_from_dataset(ds, time_interest_start=None, time_interest_stop=None): +def get_stats_from_dataset(ds): ds_stats = {} # TODO: beware on timezones @@ -75,19 +75,7 @@ def get_stats_from_dataset(ds, time_interest_start=None, time_interest_stop=None else: qc_none = False ds_stats['qc_none'] = qc_none - - if time_interest_start is not None or time_interest_stop is not None: - #calc #nan-values in recent period - # TODO: generalize interest period - ds_2000to202102 = ds.sel(time=slice(time_interest_start,time_interest_stop)) - ts_timediff_2000to202102 = ds.time.to_pandas().index.diff()[1:] - ds_stats['tstart2000'] = ds.time.to_pandas().min()<=time_interest_start - ds_stats['tstop202102'] = ds.time.to_pandas().max()>=time_interest_stop - ds_stats['nvals_2000to202102'] = len(ds_2000to202102['Meetwaarde.Waarde_Numeriek']) - ds_stats['#nans_2000to202102'] = ds_2000to202102['Meetwaarde.Waarde_Numeriek'].isnull().values.sum() - ds_stats['mintimediff_2000to202102'] = str(ts_timediff_2000to202102.min()) - ds_stats['maxtimediff_2000to202102'] = str(ts_timediff_2000to202102.max()) - + if "HWLWcode" in ds.data_vars: #TODO: should be based on 12 only, not 345 (HOEKVHLD now gives warning) if ts_timediff.min() < pd.Timedelta(hours=4): #TODO: min timediff for e.g. BROUWHVSGT08 is 3 minutes: ts_meas_ext_pd.loc[dt.datetime(2015,1,1):dt.datetime(2015,1,2),['values', 'QC', 'Status']]. This should not happen and with new dataset should be converted to an error