Skip to content

Commit

Permalink
feat: add caching for timezone offsets, significantly speeds up import
Browse files Browse the repository at this point in the history
This differs from PR scrapinghub#1181: it builds a cache at install time,
which can then be distributed.

closes scrapinghub#533
  • Loading branch information
tobymao committed Feb 10, 2025
1 parent 47acb88 commit 3b98ab4
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 4 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ include CONTRIBUTING.rst
include HISTORY.rst
include LICENSE
include README.rst
include dateparser/data/dateparser_tz_cache.pkl
include dateparser_data/settings.py
include requirements.txt

Expand Down
Binary file added dateparser/data/dateparser_tz_cache.pkl
Binary file not shown.
48 changes: 44 additions & 4 deletions dateparser/timezone_parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
import os
import pickle
import zlib
from datetime import datetime, timedelta, timezone, tzinfo
from pathlib import Path

import regex as re

Expand Down Expand Up @@ -84,8 +88,44 @@ def get_local_tz_offset():
return offset


# NOTE(review): this text comes from a commit diff whose +/- markers were
# stripped. The four assignments below look like the eager initialization that
# the commit replaces with _load_offsets() -- confirm against the real file.
_search_regex_parts = []
_tz_offsets = list(build_tz_offsets(_search_regex_parts))
_search_regex = re.compile("|".join(_search_regex_parts))
_search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)
# Local timezone offset, computed once when the module is imported.
local_tz_offset = get_local_tz_offset()

# Module-level placeholders; _load_offsets() assigns the real values through
# its ``global`` declaration.
_tz_offsets = None
_search_regex = None
_search_regex_ignorecase = None


def _load_offsets(cache_path, current_hash):
    """Populate the module-level timezone offsets and search regexes.

    The pickle at ``cache_path`` is expected to hold a 4-tuple
    ``(hash, tz_offsets, search_regex, search_regex_ignorecase)``. When the
    stored hash equals ``current_hash`` the cached objects are used as-is.
    If the cache is missing, unreadable, corrupt or stale, the values are
    rebuilt with ``build_tz_offsets`` and written back to ``cache_path`` on a
    best-effort basis.

    :param cache_path: path of the pickle cache file.
    :param current_hash: checksum the cached data must match to be reused.
    """
    global _tz_offsets, _search_regex, _search_regex_ignorecase

    try:
        with open(cache_path, mode="rb") as file:
            # NOTE(security): unpickling can execute arbitrary code, so this
            # file must only ever come from a trusted location.
            (
                serialized_hash,
                cached_offsets,
                cached_regex,
                cached_regex_ignorecase,
            ) = pickle.load(file)
    except (
        OSError,  # missing file, permission problems, other I/O errors
        ValueError,  # wrong number of items in the pickled tuple
        TypeError,  # pickled object is not unpackable
        EOFError,  # truncated cache file
        pickle.UnpicklingError,  # corrupt cache file
        AttributeError,  # pickled classes no longer resolvable
        ImportError,
        IndexError,
    ):
        # Any unusable cache is treated as a miss and rebuilt below.
        pass
    else:
        if current_hash == serialized_hash:
            _tz_offsets = cached_offsets
            _search_regex = cached_regex
            _search_regex_ignorecase = cached_regex_ignorecase
            return

    # Cache miss: build_tz_offsets fills the list of regex parts while it
    # yields the offsets, so the list must be materialized before joining.
    _search_regex_parts = []
    _tz_offsets = list(build_tz_offsets(_search_regex_parts))
    pattern = "|".join(_search_regex_parts)
    _search_regex = re.compile(pattern)
    _search_regex_ignorecase = re.compile(pattern, re.IGNORECASE)

    try:
        with open(cache_path, mode="wb") as file:
            pickle.dump(
                (current_hash, _tz_offsets, _search_regex, _search_regex_ignorecase),
                file,
            )
    except OSError:
        # The package directory may be read-only (e.g. a system-wide install);
        # the freshly built in-memory values are still valid, so failing to
        # persist the cache must not break the import.
        pass


# Pickled timezone cache, stored in the "data" directory next to this module.
CACHE_PATH = Path(__file__).parent / "data" / "dateparser_tz_cache.pkl"

# Fill the module-level offsets and regexes at import time. The CRC32 of the
# stringified timezone_info_list is the key that decides whether the cached
# data can be reused.
_load_offsets(
    CACHE_PATH,
    zlib.crc32(str(timezone_info_list).encode("utf-8")),
)
16 changes: 16 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,26 @@
import re
import subprocess

from setuptools import find_packages, setup
from setuptools.command import develop, install

# Extract the version string from dateparser/__init__.py without importing the
# package. The context manager closes the file handle deterministically
# instead of leaving it to the garbage collector.
with open("dateparser/__init__.py") as _init_file:
    __version__ = re.search(
        r"__version__.*\s*=\s*[\"]([^\"]+)[\"]", _init_file.read()
    ).group(1)


class PostDevelop(develop.develop):
    """``develop`` command that runs the timezones script before the standard step."""

    def run(self):
        """Run ``dateparser_scripts/timezones.py``, then the regular develop command."""
        import sys

        # Invoke the script with the interpreter that is running setup.py.
        # The previous shell string "python 3 ..." contained a stray space, so
        # the script was never executed. An argument list also avoids the shell.
        # The return code is deliberately ignored (best effort), as before.
        subprocess.call([sys.executable, "dateparser_scripts/timezones.py"])
        develop.develop.run(self)


class PostInstall(install.install):
    """``install`` command that runs the timezones script before the standard step."""

    def run(self):
        """Run ``dateparser_scripts/timezones.py``, then the regular install command."""
        import sys

        # Use the interpreter that is running setup.py rather than whatever
        # "python3" resolves to on PATH (it may be absent, e.g. on Windows, or
        # belong to a different environment). An argument list avoids the shell.
        # The return code is deliberately ignored (best effort), as before.
        subprocess.call([sys.executable, "dateparser_scripts/timezones.py"])
        install.install.run(self)


introduction = re.sub(
r":members:.+|..\sautomodule::.+|:class:|:func:|:ref:",
"",
Expand Down Expand Up @@ -45,6 +60,7 @@
"fasttext": ["fasttext"],
"langdetect": ["langdetect"],
},
cmdclass={"develop": PostDevelop, "install": PostInstall},
license="BSD",
zip_safe=False,
keywords="dateparser",
Expand Down

0 comments on commit 3b98ab4

Please sign in to comment.