From 11232a4a3497dd146b7c86766b2ed1061989ee72 Mon Sep 17 00:00:00 2001 From: Nicky Sandhu Date: Fri, 21 Feb 2020 17:06:06 -0800 Subject: [PATCH] fixes for issue #16: pandas timestamp limitations --- pyhecdss/__init__.py | 2 +- pyhecdss/pyhecdss.py | 92 ++++++++++++++++---------------- tests/test_pyhecdss.py | 2 + tests/test_pyhecdss_intrinsic.py | 7 +-- 4 files changed, 54 insertions(+), 49 deletions(-) diff --git a/pyhecdss/__init__.py b/pyhecdss/__init__.py index 8325d17..2c95b75 100644 --- a/pyhecdss/__init__.py +++ b/pyhecdss/__init__.py @@ -1,4 +1,4 @@ __author__ = """Nicky Sandhu""" __email__ = 'psandhu@water.ca.gov' -__version__ = "0.3.2" +__version__ = "0.4.0" from .pyhecdss import * diff --git a/pyhecdss/pyhecdss.py b/pyhecdss/pyhecdss.py index 6d6d89b..c6f852a 100755 --- a/pyhecdss/pyhecdss.py +++ b/pyhecdss/pyhecdss.py @@ -4,8 +4,12 @@ import os import time import warnings +from datetime import datetime, timedelta +from calendar import monthrange +from dateutil.parser import parse # some static functions +DATE_FMT_STR = '%d%b%Y' def set_message_level(level): """ @@ -59,6 +63,11 @@ class DSSFile: } EPART_FREQ_MAP = {v: k for k, v in FREQ_EPART_MAP.items()} # + """ + vectorized version of timedelta + """ + timedelta_minutes=np.vectorize(lambda x: timedelta(minutes=int(x))) + def __init__(self, fname): self.isopen = False @@ -216,13 +225,8 @@ def num_values_in_interval(self, sdstr, edstr, istr): Get number of values in interval istr, using the start date and end date string """ - if istr.find('MON') >= 0: # less number of estimates will lead to overestimating values - td = np.timedelta64(int(istr[:istr.find('MON')]), 'M') - elif istr.find('YEAR') >= 0: - td = np.timedelta64(int(istr[:istr.find('YEAR')]), 'Y') - else: - td = pd.to_timedelta(istr) - return int((pd.to_datetime(edstr)-pd.to_datetime(sdstr))/td)+1 + td=DSSFile._get_timedelta_for_interval(istr) + return int((parse(edstr)-parse(sdstr))/td)+1 def julian_day(self, date): """ @@ -242,39 +246,38 @@ def m2ihm(self, minute): def parse_pathname_epart(self, pathname): return pathname.split('/')[1:7][4] - def _number_between(startDateStr, endDateStr, delta=np.timedelta64(1, 'D')): + def _number_between(startDateStr, endDateStr, delta=timedelta(days=1)): """ This is just a guess at number of values to be read so going over is ok. """ - return round((pd.to_datetime(endDateStr)-pd.to_datetime(startDateStr))/delta+1) - - def _get_timedelta_unit(epart): - if 'YEAR' in epart: - return 'Y' - elif 'MON' in epart: - return 'M' - elif 'WEEK' in epart: - return 'W' - elif 'DAY' in epart: - return 'D' - elif 'HOUR' in epart: - return 'H' - elif 'MIN' in epart: - return 'm' + return round((parse(endDateStr)-parse(startDateStr))/delta+1) + + def _get_timedelta_for_interval(interval): + """ + get minimum timedelta for interval defined by string. e.g. for month it is 28 days (minimum) + """ + if interval.find('MON') >= 0: # less number of estimates will lead to overestimating values + td = timedelta(days=28) + elif interval.find('YEAR') >= 0: + td = timedelta(days=365) else: - raise Exception( - "Unknown epart to time delta conversion for epart=%s" % epart) + td = timedelta(seconds=DSSFile.EPART_FREQ_MAP[interval].nanos/1e9) + return td def _pad_to_end_of_block(self, endDateStr, interval): + edate=parse(endDateStr) if interval.find('MON') >= 0 or interval.find('YEAR') >= 0: - buffer = pd.DateOffset(years=10) + edate=datetime((edate.year//10+1)*10,1,1) elif interval.find('DAY') >= 0: - buffer = pd.DateOffset(years=1) + edate=datetime(edate.year+1,1,1) elif interval.find('HOUR') >= 0 or interval.find('MIN') >= 0: - buffer = pd.DateOffset(months=1) + if edate.month == 12: + edate=datetime(edate.year+1,1,1) + else: + edate=datetime(edate.year,edate.month+1,1) else: - buffer = pd.DateOffset(days=1) - return (pd.to_datetime(endDateStr) + buffer).strftime('%d%b%Y').upper() + edate = edate+timedelta(days=1) + return edate.strftime(DATE_FMT_STR).upper() def _get_istat_for_zrrtsxd(self, istat): """ @@ -352,9 +355,8 @@ def read_rts(self, pathname, startDateStr=None, endDateStr=None): endDateStr = edate.strip() endDateStr = self._pad_to_end_of_block( endDateStr, interval) - nvals = self.num_values_in_interval( - startDateStr, endDateStr, interval) - sdate = pd.to_datetime(startDateStr) + nvals = self.num_values_in_interval(startDateStr, endDateStr, interval) + sdate = parse(startDateStr) cdate = sdate.date().strftime('%d%b%Y').upper() ctime = ''.join(sdate.time().isoformat().split(':')[:2]) # PERF: could be np.empty if all initialized @@ -369,9 +371,9 @@ def read_rts(self, pathname, startDateStr=None, endDateStr=None): # FIXME: deal with non-zero iofset for period data,i.e. else part of if stmt below freqoffset = DSSFile.EPART_FREQ_MAP[interval] if ctype.startswith('INST'): - startDateWithOffset=pd.to_datetime(startDateStr) + startDateWithOffset=parse(startDateStr) if iofset !=0: - startDateWithOffset=pd.to_datetime(startDateStr)-freqoffset+pd.to_timedelta('%dT'%iofset) + startDateWithOffset=parse(startDateStr)-freqoffset+timedelta(minutes=iofset) dindex = pd.date_range( startDateWithOffset, periods=nvals, freq=freqoffset) else: @@ -430,10 +432,12 @@ def read_its(self, pathname, startDateStr=None, endDateStr=None, guess_vals_per_ if startDateStr == None or endDateStr == None: raise Exception( "Either pathname D PART contains timewindow or specify in startDateStr and endDateStr for this call") - startDateStr = (pd.to_datetime(startDateStr) - - pd.offsets.YearBegin(0)).strftime('%d%b%Y').upper() - endDateStr = (pd.to_datetime(endDateStr) + - pd.offsets.YearBegin(0)).strftime('%d%b%Y').upper() + nsdate = parse(startDateStr) + nsbdate= datetime(nsdate.year,1,1) + nedate = parse(endDateStr) + nebdate = datetime(nedate.year,1,1) + startDateStr = nsbdate.strftime(DATE_FMT_STR) + endDateStr = nebdate.strftime(DATE_FMT_STR) parts[4] = startDateStr+" - "+endDateStr else: tw = list(map(lambda x: x.strip(), parts[4].split('-'))) @@ -443,8 +447,7 @@ def read_its(self, pathname, startDateStr=None, endDateStr=None, guess_vals_per_ jule, istat = pyheclib.hec_datjul(endDateStr) ietime = istime = 0 # guess how many values to be read based on e part approximation - ktvals = DSSFile._number_between(startDateStr, endDateStr, - np.timedelta64(1, DSSFile._get_timedelta_unit(epart))) + ktvals = DSSFile._number_between(startDateStr, endDateStr, DSSFile._get_timedelta_for_interval(epart)) ktvals = guess_vals_per_block*int(ktvals) kdvals = ktvals itimes = np.zeros(ktvals, 'i') @@ -456,9 +459,8 @@ def read_its(self, pathname, startDateStr=None, endDateStr=None, guess_vals_per_ if nvals == ktvals: raise Exception( "More values than guessed! %d. Call with guess_vals_per_block > 10000 " % ktvals) - base_date = pd.to_datetime('31DEC1899')+pd.to_timedelta(ibdate, 'D') - df = pd.DataFrame(dvalues[:nvals], index=pd.to_timedelta( - itimes[:nvals], unit='m')+base_date, columns=[pathname]) + base_date = parse('31DEC1899')+timedelta(days=ibdate) + df = pd.DataFrame(dvalues[:nvals], index=base_date+DSSFile.timedelta_minutes(itimes[:nvals]), columns=[pathname]) return df, cunits.strip(), ctype.strip() # return nvals, dvalues, itimes, base_date, cunits, ctype @@ -498,7 +500,7 @@ def write_its(self, pathname, df, cunits, ctype, interval=None): jule, istat = pyheclib.hec_datjul(endDateStr) ietime = istime = 0 pathname = "/".join(parts) - itimes = df.index-pd.to_datetime(startDateStr) + itimes = df.index-parse(startDateStr) itimes = itimes.total_seconds()/60 # time in minutes since base date juls itimes = itimes.values.astype('i') # conver to integer numpy inflag = 1 # replace data (merging should be done in memory) diff --git a/tests/test_pyhecdss.py b/tests/test_pyhecdss.py index 68bbb1a..901db72 100644 --- a/tests/test_pyhecdss.py +++ b/tests/test_pyhecdss.py @@ -15,6 +15,8 @@ def setupClass(cls): os.remove('./test_its1.dsd') os.remove('./test.dsc') os.remove('./test.dsd') + os.remove('./test.dsc') + os.remove('./test.dsd') os.remove('./test.dsk') @classmethod def tearDownClass(cls): diff --git a/tests/test_pyhecdss_intrinsic.py b/tests/test_pyhecdss_intrinsic.py index 5ae2565..067c286 100644 --- a/tests/test_pyhecdss_intrinsic.py +++ b/tests/test_pyhecdss_intrinsic.py @@ -3,7 +3,8 @@ import pandas as pd import numpy as np import pyhecdss +from datetime import timedelta def test_number_between(): - assert pyhecdss.DSSFile._number_between('01JAN2000','01FEB2000',delta=pd.to_timedelta(1,'D')) > 31 - assert pyhecdss.DSSFile._number_between('01JAN2000','01FEB2000',delta=np.timedelta64(1,'M')) > 1 - assert pyhecdss.DSSFile._number_between('01JAN2000','01FEB2000',delta=np.timedelta64(1,'Y')) > 0 + assert pyhecdss.DSSFile._number_between('01JAN2000','01FEB2000',delta=timedelta(days=1)) > 31 + assert pyhecdss.DSSFile._number_between('01JAN2000','01FEB2000',delta=timedelta(days=28)) > 1 + assert pyhecdss.DSSFile._number_between('01JAN2000','01FEB2000',delta=timedelta(days=365)) > 0