-
Notifications
You must be signed in to change notification settings - Fork 104
/
lookup_cache.py
179 lines (147 loc) · 6.54 KB
/
lookup_cache.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import hashlib
import logging
import platform
import re
from datetime import datetime
from pathlib import Path
from typing import Optional
import requests
from filelock import FileLock, Timeout
from platformdirs import user_cache_path
# Module-level logger, following the standard logging.getLogger(__name__) convention.
logger = logging.getLogger(__name__)

CLEAR_CACHE_AFTER_SECONDS = 60 * 60 * 24 * 2  # 2 days
"""Cached files older than this will be deleted."""

DONT_CHECK_IF_NEWER_THAN_SECONDS = 60 * 5  # 5 minutes
"""If the cached file is newer than this, just use it without checking for updates."""

WINDOWS_TIME_EPSILON = 0.005
"""Windows has issues with file timestamps, so we add this small offset
to ensure that newly created files have a positive age.
"""
def uncached_download_file(url: str) -> bytes:
    """The simple equivalent to cached_download_file."""
    # Identify ourselves to the server; no caching or conditional-request logic here.
    response = requests.get(url, headers={"User-Agent": "conda-lock"})
    response.raise_for_status()
    return response.content
def cached_download_file(
    url: str,
    *,
    cache_subdir_name: str,
    cache_root: Optional[Path] = None,
    max_age_seconds: float = CLEAR_CACHE_AFTER_SECONDS,
    dont_check_if_newer_than_seconds: float = DONT_CHECK_IF_NEWER_THAN_SECONDS,
) -> bytes:
    """Download a file and cache it in the user cache directory.

    If the file is already cached, return the cached contents.
    If the file is not cached, download it and cache the contents
    and the ETag.

    Protect against multiple processes downloading the same file.
    """
    # Default to the per-user cache directory when no explicit root is given.
    root = user_cache_path("conda-lock", appauthor=False) if cache_root is None else cache_root
    cache = root / "cache" / cache_subdir_name
    cache.mkdir(parents=True, exist_ok=True)

    # Evict stale entries before touching the cache.
    clear_old_files_from_cache(cache, max_age_seconds=max_age_seconds)

    lock_path = (cache / cached_filename_for_url(url)).with_suffix(".lock")

    # Wait for any other process to finish downloading the file.
    # This way we can use the result from the current download without
    # spawning multiple concurrent downloads.
    while True:
        try:
            with FileLock(str(lock_path), timeout=5):
                return _download_to_or_read_from_cache(
                    url,
                    cache=cache,
                    dont_check_if_newer_than_seconds=dont_check_if_newer_than_seconds,
                )
        except Timeout:
            # Another process holds the lock; keep retrying until it releases.
            logger.warning(
                f"Failed to acquire lock on {lock_path}, it is likely "
                f"being downloaded by another process. Retrying..."
            )
def _download_to_or_read_from_cache(
    url: str, *, cache: Path, dont_check_if_newer_than_seconds: float
) -> bytes:
    """Download a file to the cache directory and return the contents.

    This function is designed to be called from within a FileLock context to avoid
    multiple processes downloading the same file.

    If the file is newer than `dont_check_if_newer_than_seconds`, then immediately
    return the cached contents. Otherwise we pass the ETag from the last download
    in the headers to avoid downloading the file if it hasn't changed remotely.
    """
    destination = cache / cached_filename_for_url(url)
    etag_file = destination.with_suffix(".etag")
    headers = {"User-Agent": "conda-lock"}

    # Fresh enough? Serve straight from the cache without a network round trip.
    if destination.is_file():
        age_seconds = get_age_seconds(destination)
        if 0 <= age_seconds < dont_check_if_newer_than_seconds:
            logger.debug(
                f"Using cached mapping {destination} of age {age_seconds}s "
                f"without checking for updates"
            )
            return destination.read_bytes()

    # Send the previously stored ETag so the server can answer 304 Not Modified
    # instead of resending an unchanged body.
    if etag_file.is_file():
        headers["If-None-Match"] = etag_file.read_text().strip()

    logger.debug(f"Requesting {url}")
    response = requests.get(url, headers=headers)
    if response.status_code == 304:
        # Remote content unchanged; reuse the cached copy.
        logger.debug(f"{url} has not changed since last download, using {destination}")
        return destination.read_bytes()

    response.raise_for_status()
    destination.write_bytes(response.content)
    if "ETag" in response.headers:
        etag_file.write_text(response.headers["ETag"])
    else:
        logger.warning("No ETag in response headers")
    logger.debug(f"Downloaded {url} to {destination}")
    return destination.read_bytes()
def cached_filename_for_url(url: str) -> str:
    """Return a filename for a URL that is probably unique to the URL.

    The filename is a 4-character hash of the URL, followed by the extension.
    If the extension is not alphanumeric or too long, it is omitted.

    >>> cached_filename_for_url("https://example.com/foo.json")
    'a5d7.json'
    >>> cached_filename_for_url("https://example.com/foo")
    '5ea6'
    >>> cached_filename_for_url("https://example.com/foo.bär")
    '2191'
    >>> cached_filename_for_url("https://example.com/foo.baaaaaar")
    '1861'
    """
    digest = hashlib.sha256(url.encode()).hexdigest()[:4]
    # Everything after the final dot is the candidate extension; keep it only
    # when it is short and purely alphanumeric.
    candidate = url.rsplit(".", 1)[-1]
    if re.fullmatch(r"[a-zA-Z0-9]{1,6}", candidate):
        return f"{digest}.{candidate}"
    return digest
def clear_old_files_from_cache(cache: Path, *, max_age_seconds: float) -> None:
    """Remove files in the cache directory older than `max_age_seconds`.

    Also removes any files that somehow have a modification time in the future.

    For safety, this raises an error if `cache` is not a subdirectory of
    a directory named `"cache"`.

    Raises:
        ValueError: if ``cache.parent`` is not named ``"cache"``.
    """
    if cache.parent.name != "cache":
        # BUG FIX: the original passed two comma-separated f-strings to
        # ValueError, so the exception's "message" was a tuple of two strings.
        # Join them into a single message string instead.
        raise ValueError(
            f"Expected cache directory, got {cache}. Parent should be 'cache' "
            f"not '{cache.parent.name}'"
        )
    for file in cache.iterdir():
        age_seconds = get_age_seconds(file)
        # Negative age means a modification time in the future; treat as stale.
        if age_seconds < 0 or age_seconds >= max_age_seconds:
            logger.debug(f"Removing old cache file {file} of age {age_seconds}s")
            file.unlink()
def get_age_seconds(path: Path) -> float:
    """Return the age of a file in seconds.

    On Windows, the age of a new file is sometimes slightly negative, so we add a small
    offset to ensure that the age is positive.
    """
    age = datetime.now().timestamp() - path.stat().st_mtime
    # Compensate for Windows filesystem timestamp granularity quirks.
    return age + WINDOWS_TIME_EPSILON if platform.system() == "Windows" else age