From 398f1d10279e74bb31f02c8cf08c53ef065d4b1f Mon Sep 17 00:00:00 2001 From: veenstrajelmer <60435591+veenstrajelmer@users.noreply.github.com> Date: Fri, 1 Mar 2024 20:48:36 +0100 Subject: [PATCH] added time-sorting of returned dataframe (#37) * added time-sorting of returned dataframe * updated history.rst * made sorting+duplicatedropping optional * added sorting/duplicated test --- HISTORY.rst | 1 + ddlpy/ddlpy.py | 16 ++++++++++---- tests/test_ddlpy.py | 51 ++++++++++++++++++++++++++++++++++++--------- 3 files changed, 54 insertions(+), 14 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 100ae33..223e5d6 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -7,6 +7,7 @@ UNRELEASED * improved nan filtering of measurements in https://github.com/openearth/ddlpy/pull/30 * add early return when no data in entire requested period in https://github.com/openearth/ddlpy/pull/33 * add `ddlpy.measurements_latest()` to retrieve latest measurements in https://github.com/openearth/ddlpy/pull/35 +* add optional time-sorting of returned measurements dataframe and made drop_duplicates optional in https://github.com/openearth/ddlpy/pull/37 0.1.0 (2019-01-03) ------------------ diff --git a/ddlpy/ddlpy.py b/ddlpy/ddlpy.py index 2255bf7..7f7415c 100644 --- a/ddlpy/ddlpy.py +++ b/ddlpy/ddlpy.py @@ -253,7 +253,7 @@ def _measurements_slice(location, start_date, end_date): return df -def measurements(location, start_date, end_date): +def measurements(location, start_date, end_date, clean_df=True): """return measurements for the given location and time window (start_date, end_date)""" measurements = [] @@ -284,7 +284,15 @@ def measurements(location, start_date, end_date): if len(measurements) > 0: measurements = pd.concat(measurements) - # drop duplicate rows (preserves e.g. different Grootheden/Groeperingen at same timestep) - measurements = measurements.drop_duplicates() - + if clean_df: + len_raw = len(measurements) + # drop duplicate rows (preserves e.g. 
different Grootheden/Groeperingen at same timestep) + measurements = measurements.drop_duplicates() + # sort dataframe on time, ddl returns non-sorted data + measurements = measurements.sort_values("t") + # reset index to be contiguous again + measurements = measurements.reset_index(drop=True) + ndropped = len_raw - len(measurements) + logger.debug(f"{ndropped} duplicated values dropped") + return measurements diff --git a/tests/test_ddlpy.py b/tests/test_ddlpy.py index f4fa83d..99f4287 100755 --- a/tests/test_ddlpy.py +++ b/tests/test_ddlpy.py @@ -2,12 +2,10 @@ # -*- coding: utf-8 -*- """Tests for `ddlpy` package.""" -import datetime - +import datetime as dt +import pandas as pd import pytest - from click.testing import CliRunner - import ddlpy from ddlpy import cli @@ -24,15 +22,15 @@ def location(): return location def test_measurements_available(location): - start_date = datetime.datetime(1953, 1, 1) - end_date = datetime.datetime(1953, 4, 1) + start_date = dt.datetime(1953, 1, 1) + end_date = dt.datetime(1953, 4, 1) data_present = ddlpy.ddlpy._measurements_available(location, start_date=start_date, end_date=end_date) assert isinstance(data_present, bool) def test_measurements(location): """measurements for a location """ - start_date = datetime.datetime(1953, 1, 1) - end_date = datetime.datetime(1953, 4, 1) + start_date = dt.datetime(1953, 1, 1) + end_date = dt.datetime(1953, 4, 1) measurements = ddlpy.measurements(location, start_date=start_date, end_date=end_date) assert measurements.shape[0] > 1 @@ -43,10 +41,43 @@ def test_measurements_latest(location): def test_measurements_long(location): """measurements for a location """ - start_date = datetime.datetime(1951, 11, 1) - end_date = datetime.datetime(1953, 4, 1) + start_date = dt.datetime(1951, 11, 1) + end_date = dt.datetime(1953, 4, 1) measurements = ddlpy.measurements(location, start_date=start_date, end_date=end_date) assert measurements.shape[0] > 1 + +def test_measurements_sorted(location): + 
"""https://github.com/openearth/ddlpy/issues/27""" + # input parameters + start_date = dt.datetime(2019,11,24) + end_date = dt.datetime(2019,12,5) + meas_wathte = ddlpy.measurements(location, start_date=start_date, end_date=end_date) + assert meas_wathte["t"].is_monotonic_increasing == True + meas_wathte_clean = ddlpy.measurements(location, start_date=start_date, end_date=end_date, clean_df=True) + assert meas_wathte_clean["t"].is_monotonic_increasing == True + meas_wathte_raw = ddlpy.measurements(location, start_date=start_date, end_date=end_date, clean_df=False) + assert meas_wathte_raw["t"].is_monotonic_increasing == False + # check whether indexes are contiguous (due to reset_index) + assert isinstance(meas_wathte_clean.index, pd.RangeIndex) + assert isinstance(meas_wathte_raw.index, pd.RangeIndex) + +def test_measurements_duplicated(location): + """ + WALSODN 2010 contains all values three times, ddlpy drops duplicates + https://github.com/openearth/ddlpy/issues/24 + if the data is cleaned in ddl, this test will fail and can be removed or adjusted + """ + locations = ddlpy.locations() + location = locations[locations['Grootheid.Code'] == 'WATHTE'].loc['WALSODN'] + start_date = dt.datetime(2010, 1, 1) + end_date = dt.datetime(2010, 1, 1, 0, 20) + measurements_clean = ddlpy.measurements(location, start_date=start_date, end_date=end_date, clean_df=True) + measurements_raw = ddlpy.measurements(location, start_date=start_date, end_date=end_date, clean_df=False) + assert len(measurements_clean) == 3 + assert len(measurements_raw) == 9 + # check whether indexes are contiguous (due to reset_index) + assert isinstance(measurements_clean.index, pd.RangeIndex) + assert isinstance(measurements_raw.index, pd.RangeIndex) def test_command_line_interface(): """Test the CLI."""