Skip to content

Commit

Permalink
added time-sorting of returned dataframe (#37)
Browse files Browse the repository at this point in the history
* added time-sorting of returned dataframe

* updated history.rst

* made sorting+duplicatedropping optional

* added sorting/duplicated test
  • Loading branch information
veenstrajelmer authored Mar 1, 2024
1 parent 1806fcd commit 398f1d1
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 14 deletions.
1 change: 1 addition & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ UNRELEASED
* improved nan filtering of measurements in https://github.com/openearth/ddlpy/pull/30
* add early return when no data in entire requested period in https://github.com/openearth/ddlpy/pull/33
* add `ddlpy.measurements_latest()` to retrieve latest measurements in https://github.com/openearth/ddlpy/pull/35
* add optional time-sorting of returned measurements dataframe and make drop_duplicates optional in https://github.com/openearth/ddlpy/pull/37

0.1.0 (2019-01-03)
------------------
Expand Down
16 changes: 12 additions & 4 deletions ddlpy/ddlpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def _measurements_slice(location, start_date, end_date):
return df


def measurements(location, start_date, end_date):
def measurements(location, start_date, end_date, clean_df=True):
"""return measurements for the given location and time window (start_date, end_date)"""
measurements = []

Expand Down Expand Up @@ -284,7 +284,15 @@ def measurements(location, start_date, end_date):
if len(measurements) > 0:
measurements = pd.concat(measurements)

# drop duplicate rows (preserves e.g. different Grootheden/Groeperingen at same timestep)
measurements = measurements.drop_duplicates()

if clean_df:
len_raw = len(measurements)
# drop duplicate rows (preserves e.g. different Grootheden/Groeperingen at same timestep)
measurements = measurements.drop_duplicates()
# sort dataframe on time, ddl returns non-sorted data
measurements = measurements.sort_values("t")
# reset index to be contiguous again
measurements = measurements.reset_index(drop=True)
ndropped = len_raw - len(measurements)
logger.debug(f"{ndropped} duplicated values dropped")

return measurements
51 changes: 41 additions & 10 deletions tests/test_ddlpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@
# -*- coding: utf-8 -*-

"""Tests for `ddlpy` package."""
import datetime

import datetime as dt
import pandas as pd
import pytest

from click.testing import CliRunner

import ddlpy
from ddlpy import cli

Expand All @@ -24,15 +22,15 @@ def location():
return location

def test_measurements_available(location):
start_date = datetime.datetime(1953, 1, 1)
end_date = datetime.datetime(1953, 4, 1)
start_date = dt.datetime(1953, 1, 1)
end_date = dt.datetime(1953, 4, 1)
data_present = ddlpy.ddlpy._measurements_available(location, start_date=start_date, end_date=end_date)
assert isinstance(data_present, bool)

def test_measurements(location):
"""measurements for a location """
start_date = datetime.datetime(1953, 1, 1)
end_date = datetime.datetime(1953, 4, 1)
start_date = dt.datetime(1953, 1, 1)
end_date = dt.datetime(1953, 4, 1)
measurements = ddlpy.measurements(location, start_date=start_date, end_date=end_date)
assert measurements.shape[0] > 1

Expand All @@ -43,10 +41,43 @@ def test_measurements_latest(location):

def test_measurements_long(location):
"""measurements for a location """
start_date = datetime.datetime(1951, 11, 1)
end_date = datetime.datetime(1953, 4, 1)
start_date = dt.datetime(1951, 11, 1)
end_date = dt.datetime(1953, 4, 1)
measurements = ddlpy.measurements(location, start_date=start_date, end_date=end_date)
assert measurements.shape[0] > 1

def test_measurements_sorted(location):
    """Check time-sorting of the measurements dataframe with and without cleaning.

    Regression test for https://github.com/openearth/ddlpy/issues/27

    The default call and the explicit ``clean_df=True`` call must both yield
    a monotonically increasing ``t`` column. With ``clean_df=False`` the test
    expects the data for this period to come back unsorted.
    """
    # input parameters
    start_date = dt.datetime(2019, 11, 24)
    end_date = dt.datetime(2019, 12, 5)

    # default call: expected to behave the same as clean_df=True
    meas_wathte = ddlpy.measurements(location, start_date=start_date, end_date=end_date)
    assert meas_wathte["t"].is_monotonic_increasing

    meas_wathte_clean = ddlpy.measurements(location, start_date=start_date, end_date=end_date, clean_df=True)
    assert meas_wathte_clean["t"].is_monotonic_increasing

    # without cleaning, the time column is expected to be unsorted for this period
    meas_wathte_raw = ddlpy.measurements(location, start_date=start_date, end_date=end_date, clean_df=False)
    assert not meas_wathte_raw["t"].is_monotonic_increasing

    # check whether indexes are contiguous (due to reset_index)
    assert isinstance(meas_wathte_clean.index, pd.RangeIndex)
    assert isinstance(meas_wathte_raw.index, pd.RangeIndex)

def test_measurements_duplicated(location):
    """
    The raw WALSODN data for 2010 holds every value three times; ddlpy drops the duplicates.
    https://github.com/openearth/ddlpy/issues/24
    If the data is ever cleaned in ddl itself, this test will fail and can be removed or adjusted.
    """
    # look up the WALSODN water-level (WATHTE) station instead of using the fixture value
    all_locations = ddlpy.locations()
    is_wathte = all_locations['Grootheid.Code'] == 'WATHTE'
    walsodn = all_locations[is_wathte].loc['WALSODN']

    # a 20-minute window at the start of 2010
    period = {
        "start_date": dt.datetime(2010, 1, 1),
        "end_date": dt.datetime(2010, 1, 1, 0, 20),
    }
    cleaned = ddlpy.measurements(walsodn, clean_df=True, **period)
    uncleaned = ddlpy.measurements(walsodn, clean_df=False, **period)

    assert len(cleaned) == 3
    assert len(uncleaned) == 9

    # check whether indexes are contiguous (due to reset_index)
    for frame in (cleaned, uncleaned):
        assert isinstance(frame.index, pd.RangeIndex)

def test_command_line_interface():
"""Test the CLI."""
Expand Down

0 comments on commit 398f1d1

Please sign in to comment.