Skip to content

Commit

Permalink
Add Python bindings for time zone data (TZiF) reader (#12826)
Browse files Browse the repository at this point in the history
This PR adds bindings to the TZiF reader that was added in the libcudf API in #12805.

No tests are being added, as these bindings are for internal use only. In follow-up PRs, I will add a timezone-aware datetime type and timezone-aware operations to the public API, along with tests for those operations.

The bindings can be used as follows:

```python
>>> transition_times, offsets = make_timezone_transition_table("/usr/share/zoneinfo", "America/New_York")
                                            
>>> transition_times
<cudf.core.column.datetime.DatetimeColumn object at 0x7f95cd6ac840>
[
  1883-11-18 17:00:00,
  1883-11-18 17:00:00,
  1918-03-31 07:00:00,
  1918-10-27 06:00:00,
  1919-03-30 07:00:00,
  1919-10-26 06:00:00,
  1920-03-28 07:00:00,
  1920-10-31 06:00:00,
  1921-04-24 07:00:00,
  1921-09-25 06:00:00,
  ...
  2365-03-14 07:00:00,
  2365-11-07 06:00:00,
  2366-03-13 07:00:00,
  2366-11-06 06:00:00,
  2367-03-12 07:00:00,
  2367-11-05 06:00:00,
  2368-03-10 07:00:00,
  2368-11-03 06:00:00,
  2369-03-09 07:00:00,
  2369-11-02 06:00:00
]
dtype: datetime64[s]

>>> offsets
<cudf.core.column.timedelta.TimeDeltaColumn object at 0x7f94e69bad40>
[
  -18000,
  -18000,
  -14400,
  -18000,
  -14400,
  -18000,
  -14400,
  -18000,
  -14400,
  -18000,
  ...
  -14400,
  -18000,
  -14400,
  -18000,
  -14400,
  -18000,
  -14400,
  -18000,
  -14400,
  -18000
]
dtype: timedelta64[s]
```

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #12826
  • Loading branch information
shwina authored Apr 17, 2023
1 parent 7c3a34e commit b05d5e7
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 0 deletions.
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ set(cython_sources
string_casting.pyx
strings_udf.pyx
text.pyx
timezone.pyx
transform.pyx
transpose.pyx
types.pyx
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
strings,
strings_udf,
text,
timezone,
transpose,
unary,
)
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/cpp/io/timezone.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.cpp.libcpp.optional cimport optional
from cudf._lib.cpp.table.table cimport table


# Cython declaration of the libcudf entry point declared in
# "cudf/timezone.hpp" (C++ namespace ``cudf``). The whole extern block is
# marked ``nogil``, so the function may be called without holding the GIL.
cdef extern from "cudf/timezone.hpp" namespace "cudf" nogil:
    # tzif_dir:      optional directory to read the time zone file from.
    #                NOTE(review): presumably libcudf falls back to a default
    #                location when this is empty -- confirm against the header.
    # timezone_name: name of the time zone to load (e.g. "America/New_York").
    # Returns an owning pointer to the resulting libcudf table.
    # ``except +`` lets Cython translate C++ exceptions into Python ones.
    unique_ptr[table] make_timezone_transition_table(
        optional[string] tzif_dir,
        string timezone_name
    ) except +
50 changes: 50 additions & 0 deletions python/cudf/cudf/_lib/cpp/libcpp/optional.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier:
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from libcpp cimport bool


# Cython declarations for the C++17 ``<optional>`` header (namespace ``std``).
# Everything here is declared ``nogil`` so it can be used inside
# ``with nogil`` sections.
# NOTE(review): this looks like a vendored copy of Cython's own
# ``libcpp.optional`` declarations -- confirm before modifying signatures.
cdef extern from "<optional>" namespace "std" nogil:
    # Tag type of the ``nullopt`` constant below.
    cdef cppclass nullopt_t:
        nullopt_t()

    # Constant used to construct/assign an empty optional.
    cdef nullopt_t nullopt

    cdef cppclass optional[T]:
        ctypedef T value_type
        # Constructors: empty, from nullopt, copy, and from a value.
        # The ``except +`` variants translate C++ exceptions to Python ones.
        optional()
        optional(nullopt_t)
        optional(optional&) except +
        optional(T&) except +
        # Observers.
        bool has_value()
        T& value()
        T& value_or[U](U& default_value)
        # Modifiers.
        void swap(optional&)
        void reset()
        T& emplace(...)
        # Dereference, assignment and boolean conversion operators.
        T& operator*()
        optional& operator=(optional&)
        optional& operator=[U](U&)
        bool operator bool()
        bool operator!()
        # Comparison operators against another value of type U.
        bool operator==[U](optional&, U&)
        bool operator!=[U](optional&, U&)
        bool operator<[U](optional&, U&)
        bool operator>[U](optional&, U&)
        bool operator<=[U](optional&, U&)
        bool operator>=[U](optional&, U&)

    # Free function that builds an optional from constructor arguments.
    optional[T] make_optional[T](...) except +
28 changes: 28 additions & 0 deletions python/cudf/cudf/_lib/timezone.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf._lib.cpp.io.timezone cimport (
make_timezone_transition_table as cpp_make_timezone_transition_table,
)
from cudf._lib.cpp.libcpp.optional cimport make_optional
from cudf._lib.cpp.table.table cimport table
from cudf._lib.utils cimport columns_from_unique_ptr


def make_timezone_transition_table(tzdir, tzname):
    """
    Build the transition table for a time zone via libcudf.

    Parameters
    ----------
    tzdir : str
        Directory containing the time zone (TZif) files,
        e.g. "/usr/share/zoneinfo".
    tzname : str
        Name of the time zone, e.g. "America/New_York".

    Returns
    -------
    The columns of the libcudf result table, as produced by
    ``columns_from_unique_ptr``.
    """
    # Encode to bytes up front: Python objects cannot be touched once the
    # GIL has been released below.
    cdef string c_tzif_dir = tzdir.encode()
    cdef string c_zone_name = tzname.encode()
    cdef unique_ptr[table] c_table

    with nogil:
        c_table = move(
            cpp_make_timezone_transition_table(
                make_optional[string](c_tzif_dir),
                c_zone_name
            )
        )

    # Hand ownership of the table over to the column-conversion helper.
    return columns_from_unique_ptr(move(c_table))
71 changes: 71 additions & 0 deletions python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

import os
import zoneinfo
from functools import lru_cache

# The Cython module exposes this function as
# ``make_timezone_transition_table``; importing the previous (non-existent)
# name raised ImportError. The alias keeps the local name used below.
from cudf._lib.timezone import (
    make_timezone_transition_table as build_timezone_transition_table,
)
from cudf.core.dataframe import DataFrame


@lru_cache(maxsize=20)
def get_tz_data(zone_name):
    """
    Return timezone data (transition times and UTC offsets) for the
    given IANA time zone.

    The directories in ``zoneinfo.TZPATH`` are searched first; if the zone
    is not found there, the ``tzdata`` package is used as a fallback.
    Results are memoized for up to 20 distinct zone names, so repeated
    calls with the same name return the same cached object.

    Parameters
    ----------
    zone_name: str
        IANA time zone name

    Returns
    -------
    DataFrame with two columns containing the transition times ("dt")
    and corresponding UTC offsets ("offsets").

    Raises
    ------
    zoneinfo.ZoneInfoNotFoundError
        If the zone is found neither on TZPATH nor in ``tzdata``.
    """
    try:
        # like zoneinfo, we first look in TZPATH
        return _find_and_read_tzfile_tzpath(zone_name)
    except zoneinfo.ZoneInfoNotFoundError:
        # if that fails, we fall back to using `tzdata`
        return _find_and_read_tzfile_tzdata(zone_name)


def _find_and_read_tzfile_tzpath(zone_name):
    """
    Read the transition table for ``zone_name`` from the first directory
    in ``zoneinfo.TZPATH`` that contains a file with that name.

    Raises ``zoneinfo.ZoneInfoNotFoundError`` if no directory does.
    """
    # Lazily scan TZPATH and stop at the first directory holding the file.
    tzdir = next(
        (
            candidate
            for candidate in zoneinfo.TZPATH
            if os.path.isfile(os.path.join(candidate, zone_name))
        ),
        None,
    )
    if tzdir is None:
        raise zoneinfo.ZoneInfoNotFoundError(zone_name)
    return _read_tzfile_as_frame(tzdir, zone_name)


def _find_and_read_tzfile_tzdata(zone_name):
    """
    Read the transition table for ``zone_name`` from the ``tzdata``
    package.

    Raises ``zoneinfo.ZoneInfoNotFoundError`` if the package or the zone
    file cannot be found or read.
    """
    from importlib import resources

    try:
        # Resolving the package location is kept inside the ``try`` so that
        # a missing ``tzdata`` install is reported as "zone not found".
        tzdata_dir = str(resources.files("tzdata.zoneinfo"))
        return _read_tzfile_as_frame(tzdata_dir, zone_name)
    # TODO: make it so that the call to libcudf raises a
    # FileNotFoundError instead of a RuntimeError
    except (ImportError, FileNotFoundError, UnicodeEncodeError, RuntimeError):
        # The exception list is essentially vendored from the zoneinfo
        # library; every entry amounts to "we cannot find this key":
        #
        # ImportError: the package doesn't exist (e.g. tzdata is not
        #   installed, or there's an error in the folder name like
        #   Amrica/New_York)
        # FileNotFoundError: the resource doesn't exist in the package
        #   (e.g. Europe/Krasnoy)
        # UnicodeEncodeError: the package or resource name is not UTF-8,
        #   such as keys containing a surrogate character.
        # RuntimeError: what the libcudf call currently raises instead of
        #   FileNotFoundError (see the TODO above).
        raise zoneinfo.ZoneInfoNotFoundError(zone_name)


def _read_tzfile_as_frame(tzdir, zone_name):
    """
    Read the time zone file for ``zone_name`` under ``tzdir`` and return
    its contents as a DataFrame with columns "dt" and "offsets".
    """
    # The binding yields exactly two columns: transition times and offsets.
    transition_times, utc_offsets = build_timezone_transition_table(
        tzdir, zone_name
    )
    column_names = ["dt", "offsets"]
    return DataFrame._from_columns(
        [transition_times, utc_offsets], column_names
    )

0 comments on commit b05d5e7

Please sign in to comment.