Skip to content

Commit

Permalink
Add datetime parsing for executing UDFs at timestamps. (#653)
Browse files Browse the repository at this point in the history
* Add datetime parsing for executing UDFs at timestamps.

This change adds the initial step for executing UDFs at timestamps.
To allow users to specify a timestamp of a registered UDF to use,
we add `@` followed by a timestamp to the name of the UDF.

For example, to execute the version of `my/udf` as it was as of
2023-06-17 at 07:16 UTC, they would execute:

    "my/udf@2023-06-17 07:16"

This is unambiguous and avoids the need to add a new parameter, which
would be difficult to name and could shadow parameters in user code.

* Parse datetimes using the canonical T version.

* Allow spaces around the `@` delimiter.
  • Loading branch information
thetorpedodog authored Sep 19, 2024
1 parent e6bbebb commit 137cd14
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 1 deletion.
35 changes: 34 additions & 1 deletion src/tiledb/cloud/udf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import base64
import datetime
import uuid
import warnings
from typing import Any, Callable, Iterable, Optional, Union
from typing import Any, Callable, Iterable, Optional, Tuple, Union

import cloudpickle

Expand Down Expand Up @@ -172,6 +173,38 @@ def exec_async(*args, **kwargs) -> Any:
return sender.wrap_async_base_call(exec_base, *args, **kwargs)


# Timestamp layouts accepted after the "@" in a UDF name, tried in order.
# Spaces in the user's text are converted to "T" before matching, so only the
# "T"-separated spellings need to be listed here.
_TIME_FORMATS = (
    "%Y-%m-%d",
    "%Y-%m-%dT%H:%M",
    "%Y-%m-%dT%H:%M:%S",
    "%Y-%m-%dT%H:%M:%S.%f",
)


def _parse_udf_name_timestamp(
    full_name: str,
) -> Tuple[str, Optional[datetime.datetime]]:
    """Split a ``name[@timestamp]`` string into its name and timestamp.

    Everything before the first ``@`` is the UDF name; everything after it
    is the timestamp.  Whitespace around either part is ignored.

    :param full_name: The UDF name, optionally followed by ``@`` and a
        timestamp, e.g. ``"my/udf@2023-06-17 07:16"``.
    :return: A ``(name, timestamp)`` pair.  ``timestamp`` is ``None`` when
        ``full_name`` contains no ``@``; otherwise it is a timezone-aware
        ``datetime`` in UTC.
    :raises ValueError: If an ``@`` is present but the text after it does
        not match any entry of ``_TIME_FORMATS``.
    """
    name, at, raw_ts = full_name.partition("@")
    name = name.strip()
    if not at:
        # This means that "@" was not found in the string,
        # and we're just running a normal UDF.
        return name, None
    # Keep the user's own spelling around so the error message can quote
    # exactly what they typed rather than the normalized form.
    raw_ts = raw_ts.strip()
    normalized = raw_ts.replace(" ", "T")
    for fmt in _TIME_FORMATS:
        try:
            naive_ts = datetime.datetime.strptime(normalized, fmt)
        except ValueError:
            continue
        # The parsed value carries no zone; it is declared to be UTC.
        return name, naive_ts.replace(tzinfo=datetime.timezone.utc)
    raise ValueError(
        f"Could not parse {raw_ts!r} as a timestamp. "
        "Timestamp must be formatted as yyyy-MM-dd[ HH:mm[:ss[.SSS]]], "
        "and will be interpreted as UTC."
    )


def register_udf(
func,
name,
Expand Down
36 changes: 36 additions & 0 deletions tests/test_generic_udf.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import datetime
import unittest

import numpy as np
Expand Down Expand Up @@ -122,3 +123,38 @@ def test():

with self.assertRaises(tiledb_cloud_error.TileDBCloudError):
udf.exec(test, timeout=1)


class ParserTest(unittest.TestCase):
    """Exercises ``udf._parse_udf_name_timestamp`` on good and bad input."""

    def test_parse_udf_name_timestamp(self) -> None:
        """Each well-formed input yields the expected (name, timestamp)."""
        # Maps the raw user-supplied string to the pair the parser must return.
        expectations = {
            "just-a-name": ("just-a-name", None),
            "udf/name @ 2022-03-04": ("udf/name", _utc(2022, 3, 4)),
            "other/name@2022-03-04 05:06": (
                "other/name",
                _utc(2022, 3, 4, 5, 6),
            ),
            "prince@1999-09-09 21:21:21": (
                "prince",
                _utc(1999, 9, 9, 21, 21, 21),
            ),
            " uses-t @ 2024-09-17T20:59:59.999999 ": (
                "uses-t",
                _utc(2024, 9, 17, 20, 59, 59, 999999),
            ),
            "lowercase-t@2020-01-02t03:04": (
                "lowercase-t",
                _utc(2020, 1, 2, 3, 4),
            ),
        }
        for raw, expected in expectations.items():
            with self.subTest(raw):
                actual = udf._parse_udf_name_timestamp(raw)
                self.assertEqual(expected, actual)

    def test_parse_udf_name_timestamp_bad(self) -> None:
        """Each malformed timestamp is rejected with ``ValueError``."""
        malformed = [
            "name@not a time at all",
            "too-short@2020-01",
            "no-space@2020-01-0203",
            "hour-only@2020-01-02 03",
            "too-precise@2020-01-02 03:04:05.67890123456",
        ]
        for raw in malformed:
            with self.subTest(raw), self.assertRaises(ValueError):
                udf._parse_udf_name_timestamp(raw)


def _utc(*args: int) -> datetime.datetime:
    """Build a UTC-aware ``datetime`` from positional date/time fields.

    The arguments are forwarded unchanged to ``datetime.datetime``.
    """
    utc_zone = datetime.timezone.utc
    return datetime.datetime(*args, tzinfo=utc_zone)

0 comments on commit 137cd14

Please sign in to comment.