From b05d5e71cd6a4d2940cb3435eaf8b791358b7531 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Mon, 17 Apr 2023 10:05:20 -0400 Subject: [PATCH] Add Python bindings for time zone data (TZiF) reader (#12826) This PR adds bindings to the TZiF reader that was added in the libcudf API in https://github.com/rapidsai/cudf/pull/12805. No tests are being added as these bindings are just for internal-use. In follow-up PRs, I will add a timezone-aware datetime type and timezone-aware operations to the public API, along with tests for those operations. The bindings can be used as follows: ```python >>> transition_times, offsets = make_timezone_transition_table("/usr/share/zoneinfo", "America/New_York") >>> transition_times [ 1883-11-18 17:00:00, 1883-11-18 17:00:00, 1918-03-31 07:00:00, 1918-10-27 06:00:00, 1919-03-30 07:00:00, 1919-10-26 06:00:00, 1920-03-28 07:00:00, 1920-10-31 06:00:00, 1921-04-24 07:00:00, 1921-09-25 06:00:00, ... 2365-03-14 07:00:00, 2365-11-07 06:00:00, 2366-03-13 07:00:00, 2366-11-06 06:00:00, 2367-03-12 07:00:00, 2367-11-05 06:00:00, 2368-03-10 07:00:00, 2368-11-03 06:00:00, 2369-03-09 07:00:00, 2369-11-02 06:00:00 ] dtype: datetime64[s] >>> offsets [ -18000, -18000, -14400, -18000, -14400, -18000, -14400, -18000, -14400, -18000, ... 
-14400, -18000, -14400, -18000, -14400, -18000, -14400, -18000, -14400, -18000 ] dtype: timedelta64[s] ``` Authors: - Ashwin Srinath (https://github.com/shwina) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12826 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 + python/cudf/cudf/_lib/__init__.py | 1 + python/cudf/cudf/_lib/cpp/io/timezone.pxd | 15 ++++ python/cudf/cudf/_lib/cpp/libcpp/optional.pxd | 50 +++++++++++++ python/cudf/cudf/_lib/timezone.pyx | 28 ++++++++ python/cudf/cudf/core/_internals/timezones.py | 71 +++++++++++++++++++ 6 files changed, 166 insertions(+) create mode 100644 python/cudf/cudf/_lib/cpp/io/timezone.pxd create mode 100644 python/cudf/cudf/_lib/cpp/libcpp/optional.pxd create mode 100644 python/cudf/cudf/_lib/timezone.pyx create mode 100644 python/cudf/cudf/core/_internals/timezones.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index f7d4f12ad81..9391555a272 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -48,6 +48,7 @@ set(cython_sources string_casting.pyx strings_udf.pyx text.pyx + timezone.pyx transform.pyx transpose.pyx types.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index b101db9a744..09227def4e7 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -35,6 +35,7 @@ strings, strings_udf, text, + timezone, transpose, unary, ) diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd new file mode 100644 index 00000000000..ba481d9a1d3 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+
+from cudf._lib.cpp.libcpp.optional cimport optional
+from cudf._lib.cpp.table.table cimport table
+
+
+cdef extern from "cudf/timezone.hpp" namespace "cudf" nogil:
+    unique_ptr[table] make_timezone_transition_table(
+        optional[string] tzif_dir,
+        string timezone_name
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd
new file mode 100644
index 00000000000..a78c18f3f7a
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd
@@ -0,0 +1,50 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION &
+# AFFILIATES. All rights reserved. SPDX-License-Identifier:
+# Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from libcpp cimport bool
+
+
+cdef extern from "<optional>" namespace "std" nogil:
+    cdef cppclass nullopt_t:
+        nullopt_t()
+
+    cdef nullopt_t nullopt
+
+    cdef cppclass optional[T]:
+        ctypedef T value_type
+        optional()
+        optional(nullopt_t)
+        optional(optional&) except +
+        optional(T&) except +
+        bool has_value()
+        T& value()
+        T& value_or[U](U& default_value)
+        void swap(optional&)
+        void reset()
+        T& emplace(...)
+        T& operator*()
+        optional& operator=(optional&)
+        optional& operator=[U](U&)
+        bool operator bool()
+        bool operator!()
+        bool operator==[U](optional&, U&)
+        bool operator!=[U](optional&, U&)
+        bool operator<[U](optional&, U&)
+        bool operator>[U](optional&, U&)
+        bool operator<=[U](optional&, U&)
+        bool operator>=[U](optional&, U&)
+
+    optional[T] make_optional[T](...) except +
diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx
new file mode 100644
index 00000000000..4d76cbfcdb5
--- /dev/null
+++ b/python/cudf/cudf/_lib/timezone.pyx
@@ -0,0 +1,33 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from cudf._lib.cpp.io.timezone cimport (
+    make_timezone_transition_table as cpp_make_timezone_transition_table,
+)
+from cudf._lib.cpp.libcpp.optional cimport make_optional
+from cudf._lib.cpp.table.table cimport table
+from cudf._lib.utils cimport columns_from_unique_ptr
+
+
+def make_timezone_transition_table(tzdir, tzname):
+    """
+    Read the TZif data for time zone `tzname` found under directory
+    `tzdir` and return the columns of the resulting transition table
+    (transition times and UTC offsets).
+    """
+    cdef unique_ptr[table] c_result
+    cdef string c_tzdir = tzdir.encode()
+    cdef string c_tzname = tzname.encode()
+
+    with nogil:
+        c_result = move(
+            cpp_make_timezone_transition_table(
+                make_optional[string](c_tzdir),
+                c_tzname
+            )
+        )
+
+    return columns_from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
new file mode 100644
index 00000000000..0cc5db57c9c
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/timezones.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+import os
+import zoneinfo
+from functools import lru_cache
+
+from cudf._lib.timezone import make_timezone_transition_table
+from cudf.core.dataframe import DataFrame
+
+
+@lru_cache(maxsize=20)
+def get_tz_data(zone_name):
+    """
+    Return timezone data (transition times and UTC offsets) for the
+    given IANA time zone.
+
+    Parameters
+    ----------
+    zone_name: str
+        IANA time zone name
+
+    Returns
+    -------
+    DataFrame with two columns containing the transition times ("dt")
+    and corresponding UTC offsets ("offsets").
+
+    Raises
+    ------
+    zoneinfo.ZoneInfoNotFoundError
+        If the zone is found neither on ``zoneinfo.TZPATH`` nor in the
+        ``tzdata`` package.
+    """
+    try:
+        # like zoneinfo, we first look in TZPATH
+        return _find_and_read_tzfile_tzpath(zone_name)
+    except zoneinfo.ZoneInfoNotFoundError:
+        # if that fails, we fall back to using `tzdata`
+        return _find_and_read_tzfile_tzdata(zone_name)
+
+
+def _find_and_read_tzfile_tzpath(zone_name):
+    """
+    Read `zone_name` from the first directory in ``zoneinfo.TZPATH``
+    that contains a file for it.
+
+    Raises ``zoneinfo.ZoneInfoNotFoundError`` if no directory does.
+    """
+    for search_path in zoneinfo.TZPATH:
+        if os.path.isfile(os.path.join(search_path, zone_name)):
+            return _read_tzfile_as_frame(search_path, zone_name)
+    raise zoneinfo.ZoneInfoNotFoundError(zone_name)
+
+
+def _find_and_read_tzfile_tzdata(zone_name):
+    """
+    Read `zone_name` from the ``tzdata.zoneinfo`` package resources.
+
+    Raises ``zoneinfo.ZoneInfoNotFoundError`` if the lookup fails.
+    """
+    import importlib.resources
+
+    package_base = "tzdata.zoneinfo"
+    try:
+        return _read_tzfile_as_frame(
+            str(importlib.resources.files(package_base)), zone_name
+        )
+    # TODO: make it so that the call to libcudf raises a
+    # FileNotFoundError instead of a RuntimeError
+    except (ImportError, FileNotFoundError, UnicodeEncodeError, RuntimeError):
+        # the "except" part of this try-except is basically vendored
+        # from the zoneinfo library.
+        #
+        # There are three types of exception that can be raised that all amount
+        # to "we cannot find this key":
+        #
+        # ImportError: If package_name doesn't exist (e.g. if tzdata is not
+        #   installed, or if there's an error in the folder name like
+        #   Amrica/New_York)
+        # FileNotFoundError: If resource_name doesn't exist in the package
+        #   (e.g. Europe/Krasnoy)
+        # UnicodeEncodeError: If package_name or resource_name are not UTF-8,
+        #   such as keys containing a surrogate character.
+        raise zoneinfo.ZoneInfoNotFoundError(zone_name)
+
+
+def _read_tzfile_as_frame(tzdir, zone_name):
+    """
+    Read the TZif data for `zone_name` under `tzdir` into a DataFrame
+    with columns "dt" and "offsets".
+    """
+    dt, offsets = make_timezone_transition_table(tzdir, zone_name)
+    return DataFrame._from_columns([dt, offsets], ["dt", "offsets"])