added time-sorting of returned dataframe (#37)

* added time-sorting of returned dataframe * updated history.rst * made sorting and duplicate-dropping optional * added sorting/duplicates test
Deltares · Mar 1, 2024 · 398f1d1 · 398f1d1
1 parent 1806fcd
commit 398f1d1
Show file tree

Hide file tree

Showing 3 changed files with 54 additions and 14 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -7,6 +7,7 @@ UNRELEASED
 * improved nan filtering of measurements in https://github.com/openearth/ddlpy/pull/30
 * add early return when no data in entire requested period in https://github.com/openearth/ddlpy/pull/33
 * add `ddlpy.measurements_latest()` to retrieve latest measurements in https://github.com/openearth/ddlpy/pull/35
+* add optional time-sorting of returned measurements dataframe and make drop_duplicates optional in https://github.com/openearth/ddlpy/pull/37
 
 0.1.0 (2019-01-03)
 ------------------

diff --git a/ddlpy/ddlpy.py b/ddlpy/ddlpy.py
@@ -253,7 +253,7 @@ def _measurements_slice(location, start_date, end_date):
     return df
 
 
-def measurements(location, start_date, end_date):
+def measurements(location, start_date, end_date, clean_df=True):
     """return measurements for the given location and time window (start_date, end_date)"""
     measurements = []
 
@@ -284,7 +284,15 @@ def measurements(location, start_date, end_date):
     if len(measurements) > 0:
         measurements = pd.concat(measurements)
 
-        # drop duplicate rows (preserves e.g. different Grootheden/Groeperingen at same timestep)
-        measurements = measurements.drop_duplicates()
-
+        if clean_df:
+            len_raw = len(measurements)
+            # drop duplicate rows (preserves e.g. different Grootheden/Groeperingen at same timestep)
+            measurements = measurements.drop_duplicates()
+            # sort dataframe on time, ddl returns non-sorted data
+            measurements = measurements.sort_values("t")
+            # reset index to be contiguous again
+            measurements = measurements.reset_index(drop=True)
+            ndropped = len_raw - len(measurements)
+            logger.debug(f"{ndropped} duplicated values dropped")
+
     return measurements
diff --git a/tests/test_ddlpy.py b/tests/test_ddlpy.py
@@ -2,12 +2,10 @@
 # -*- coding: utf-8 -*-
 
 """Tests for `ddlpy` package."""
-import datetime
-
+import datetime as dt
+import pandas as pd
 import pytest
-
 from click.testing import CliRunner
-
 import ddlpy
 from ddlpy import cli
 
@@ -24,15 +22,15 @@ def location():
     return location
 
 def test_measurements_available(location):
-    start_date = datetime.datetime(1953, 1, 1)
-    end_date = datetime.datetime(1953, 4, 1)
+    start_date = dt.datetime(1953, 1, 1)
+    end_date = dt.datetime(1953, 4, 1)
     data_present = ddlpy.ddlpy._measurements_available(location, start_date=start_date, end_date=end_date)
     assert isinstance(data_present, bool)
 
 def test_measurements(location):
     """measurements for a location """
-    start_date = datetime.datetime(1953, 1, 1)
-    end_date = datetime.datetime(1953, 4, 1)
+    start_date = dt.datetime(1953, 1, 1)
+    end_date = dt.datetime(1953, 4, 1)
     measurements = ddlpy.measurements(location, start_date=start_date, end_date=end_date)
     assert measurements.shape[0] > 1
 
@@ -43,10 +41,43 @@ def test_measurements_latest(location):
 
 def test_measurements_long(location):
     """measurements for a location """
-    start_date = datetime.datetime(1951, 11, 1)
-    end_date = datetime.datetime(1953, 4, 1)
+    start_date = dt.datetime(1951, 11, 1)
+    end_date = dt.datetime(1953, 4, 1)
     measurements = ddlpy.measurements(location, start_date=start_date, end_date=end_date)
     assert measurements.shape[0] > 1
+
+def test_measurements_sorted(location):
+    """https://github.com/openearth/ddlpy/issues/27"""
+    # input parameters
+    start_date  = dt.datetime(2019,11,24)
+    end_date = dt.datetime(2019,12,5)
+    meas_wathte = ddlpy.measurements(location, start_date=start_date, end_date=end_date)
+    assert meas_wathte["t"].is_monotonic_increasing == True
+    meas_wathte_clean = ddlpy.measurements(location, start_date=start_date, end_date=end_date, clean_df=True)
+    assert meas_wathte_clean["t"].is_monotonic_increasing == True
+    meas_wathte_raw = ddlpy.measurements(location, start_date=start_date, end_date=end_date, clean_df=False)
+    assert meas_wathte_raw["t"].is_monotonic_increasing == False
+    # check whether indexes are contiguous (due to reset_index)
+    assert isinstance(meas_wathte_clean.index, pd.RangeIndex)
+    assert isinstance(meas_wathte_raw.index, pd.RangeIndex)
+
+def test_measurements_duplicated(location):
+    """
+    WALSODN 2010 contains all values three times, ddlpy drops duplicates
+    https://github.com/openearth/ddlpy/issues/24
+    if the data is cleaned in ddl, this test will fail and can be removed or adjusted
+    """
+    locations = ddlpy.locations()
+    location = locations[locations['Grootheid.Code'] == 'WATHTE'].loc['WALSODN']
+    start_date = dt.datetime(2010, 1, 1)
+    end_date = dt.datetime(2010, 1, 1, 0, 20)
+    measurements_clean = ddlpy.measurements(location, start_date=start_date, end_date=end_date, clean_df=True)
+    measurements_raw = ddlpy.measurements(location, start_date=start_date, end_date=end_date, clean_df=False)
+    assert len(measurements_clean) == 3
+    assert len(measurements_raw) == 9
+    # check whether indexes are contiguous (due to reset_index)
+    assert isinstance(measurements_clean.index, pd.RangeIndex)
+    assert isinstance(measurements_raw.index, pd.RangeIndex)
 
 def test_command_line_interface():
     """Test the CLI."""