Add support for random replacement, fifo, and lfu caches
While more complicated to implement than just using an OrderedDict, the
LFU cache shows really impressive performance on the sample file, with
hit rates 10% higher than LRU in the mid range (100- to 1000-entry
caches), and a significant performance edge despite the complexity of
managing multiple linked lists.
masklinn committed Mar 2, 2024
1 parent 0367c3b commit 18cee5a
Showing 3 changed files with 210 additions and 10 deletions.
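
For orientation, a minimal usage sketch of how the new caches plug into the parser, mirroring the test added in this commit; the top-level exports used here (Parser, BasicResolver, caching, load_builtins) are assumed to be importable as they are in tests/test_core.py.

# Minimal usage sketch, mirroring the test added below. Assumes Parser,
# BasicResolver and load_builtins are importable as in tests/test_core.py.
from ua_parser import BasicResolver, Parser, caching
from ua_parser.loaders import load_builtins

# Wrap the regex-based resolver with the new LFU cache (here 1000 entries);
# caching.Random, caching.Fifo or caching.LRU can be swapped in the same way.
resolver = caching.CachingResolver(BasicResolver(load_builtins()), caching.Lfu(1000))
parser = Parser(resolver)

result = parser.parse("Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0")
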
25 changes: 18 additions & 7 deletions src/ua_parser/__main__.py
@@ -21,6 +21,7 @@
Parser,
PartialParseResult,
Resolver,
caching,
)
from .caching import Cache, Local
from .loaders import load_builtins, load_yaml
@@ -160,11 +161,15 @@ def get_parser(
if cache == "none":
return Parser(r).parse
elif cache == "clearing":
c = Clearing
c = caching.Clearing
elif cache == "random":
c = caching.Random
elif cache == "fifo":
c = caching.Fifo
elif cache == "lru":
c = LRU
elif cache == "lru-threadsafe":
c = lambda size: Locking(LRU(size)) # noqa: E731
c = caching.LRU
elif cache == "lfu":
c = caching.Lfu
else:
sys.exit(f"unknown cache algorithm {cache!r}")

@@ -206,7 +211,13 @@ def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
print(total, "lines", uniques, "uniques")
print(f"ideal hit rate: {(total - uniques)/total:.0%}")
print()
caches: List[Callable[[int], Cache]] = [Clearing, LRU]
caches: List[Callable[[int], Cache]] = [
caching.Clearing,
caching.LRU,
caching.Random,
caching.Fifo,
caching.Lfu,
]
for cache, cache_size in itertools.product(
caches,
args.cachesizes,
@@ -367,8 +378,8 @@ def __call__(
bench.add_argument(
"--caches",
nargs="+",
choices=["none", "clearing", "lru", "lru-threadsafe"],
default=["none", "clearing", "lru", "lru-threadsafe"],
choices=["none", "clearing", "random", "fifo", "lru", "lfu"],
default=["none", "clearing", "random", "fifo", "lru", "lfu"],
help="""Cache implementations to test. `clearing` completely
clears the cache when full, `lru` uses a least-recently-used eviction
policy. `lru` is not thread-safe, so `lru-threadsafe` adds a mutex
186 changes: 183 additions & 3 deletions src/ua_parser/caching.py
@@ -1,17 +1,24 @@
from __future__ import annotations

import abc
import dataclasses
import random
import threading
from collections import OrderedDict
from contextvars import ContextVar
from typing import Callable, Dict, Optional, Protocol
from typing import Callable, Dict, List, Optional, Protocol, cast

from .core import Domain, PartialParseResult, Resolver

__all__ = [
"CachingResolver",
"Cache",
"CachingResolver",
"Clearing",
"Locking",
"Fifo",
"LRU",
"Lfu",
"Locking",
"Random",
]


@@ -104,6 +111,179 @@ def __setitem__(self, key: str, value: PartialParseResult) -> None:
self.cache.popitem(last=False)


class Random:
"""Random Replacement cache. Space is made before a new entry is
inserted, to ensure the new entry is not immediately popped.
Defaults to the standard library's default PRNG, but that can be
overridden.
Non thread-safe.
"""

def __init__(self, maxsize: int, rng: random.Random = random.Random()) -> None:
self.maxsize = maxsize
self.rng = rng
self.cache: Dict[str, PartialParseResult] = {}
self.entries: List[str] = []

def __getitem__(self, key: str) -> Optional[PartialParseResult]:
return self.cache.get(key)

def __setitem__(self, key: str, entry: PartialParseResult) -> None:
if len(self.cache) >= self.maxsize:
idx = self.rng.randrange(len(self.entries))
k = self.entries[idx]
del self.cache[k]

self.entries[idx] = key
self.cache[key] = entry
else:
self.entries.append(key)
self.cache[key] = entry


class Fifo:
"""First-in first-out cache.
Non thread-safe
"""

def __init__(self, maxsize: int) -> None:
self.maxsize = maxsize
self.cache: OrderedDict[str, PartialParseResult] = OrderedDict()

def __getitem__(self, key: str) -> Optional[PartialParseResult]:
return self.cache.get(key)

def __setitem__(self, key: str, entry: PartialParseResult) -> None:
self.cache[key] = entry
while len(self.cache) > self.maxsize:
self.cache.popitem(last=False)


@dataclasses.dataclass
class LfuNode:
__slots__ = ("prev", "next", "freq", "first", "last")
prev: Optional[LfuNode]
next: Optional[LfuNode]
freq: int
first: LfuEntry
last: LfuEntry

def append(self, entry: LfuEntry) -> None:
entry.node = self
entry.prev = self.last
self.last.next = entry
self.last = entry


@dataclasses.dataclass
class LfuEntry:
__slots__ = ("node", "prev", "next", "key", "value")
node: LfuNode
prev: Optional[LfuEntry]
next: Optional[LfuEntry]
key: str
value: PartialParseResult

def pop(self) -> bool:
node = self.node
if self.prev and self.next: # middle of list
self.next.prev = self.prev
self.prev.next = self.next
elif self.next: # first entry
node.first = self.next
node.first.prev = None
elif self.prev: # last entry
node.last = self.prev
node.last.next = None
else: # sole entry
return True
return False


class Lfu:
"""Least frequently used cache.
Absolutely not even remotely thread-safe.
"""

def __init__(self, maxsize: int) -> None:
self.maxsize = maxsize
self.freqlist: Optional[LfuNode] = None
self.entries: Dict[str, LfuEntry] = {}

def __getitem__(self, key: str) -> Optional[PartialParseResult]:
e = self.entries.get(key)
if not e:
return None

self.bump_frequency(e)
return e.value

def bump_frequency(self, entry: LfuEntry) -> None:
node = entry.node
delete_node = entry.pop()

entry.prev = None
entry.next = None
if node.next and node.next.freq == node.freq + 1:
next_node = node.next
next_node.append(entry)
else:
old_next = node.next
next_node = entry.node = LfuNode(
freq=node.freq + 1,
first=entry,
last=entry,
prev=node,
next=node.next,
)
# insert new node in the linked list
node.next = next_node
if old_next:
old_next.prev = next_node

if delete_node:
if node.prev and node.next:
node.next.prev = node.prev
node.prev.next = node.next
elif node.next: # first entry
node.next.prev = None
self.freqlist = node.next
# we can ignore the last entry case (and sole entry
# sub-case) since `next_node` is necessarily after the
# current node

def __setitem__(self, key: str, entry: PartialParseResult) -> None:
# optimise: if the first node has freq 1 and a single entry,
# we're deleting the first node only to add it back,
# whereas we should swap in the new entry
if len(self.entries) >= self.maxsize:
assert self.freqlist
e = self.freqlist.first
del self.entries[e.key]
if e.pop():
self.freqlist = self.freqlist.next
if self.freqlist:
self.freqlist.prev = None

e = self.entries[key] = LfuEntry(
node=cast(LfuNode, None), key=key, value=entry, prev=None, next=None
)
if self.freqlist and self.freqlist.freq == 1:
self.freqlist.append(e)
else:
node = e.node = LfuNode(
freq=1, first=e, last=e, prev=None, next=self.freqlist
)
if self.freqlist:
self.freqlist.prev = node
self.freqlist = node


class Locking:
"""Locking cache decorator. Takes a non-thread-safe cache and
ensures retrieving and setting entries is protected by a mutex.
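
To make the frequency-list bookkeeping above concrete, here is a small illustrative sequence. The keys and stored values are hypothetical: the cache never inspects the values it stores, so opaque placeholders stand in for PartialParseResult instances.

# Illustrative sketch of LFU eviction order, assuming the Lfu class above.
from ua_parser import caching

cache = caching.Lfu(2)
r1, r2, r3 = object(), object(), object()  # placeholders for PartialParseResult values

cache["ua-a"] = r1   # "ua-a" lands in the freq-1 node
cache["ua-b"] = r2   # "ua-b" joins the same freq-1 node
cache["ua-a"]        # hit: bump_frequency moves "ua-a" to a new freq-2 node
cache["ua-c"] = r3   # cache full: evicts "ua-b", the least frequently used entry
assert cache["ua-b"] is None
assert cache["ua-a"] is not None

# Random similarly accepts an injectable PRNG, e.g. caching.Random(100,
# rng=random.Random(42)), which makes evictions reproducible in tests.
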
9 changes: 9 additions & 0 deletions tests/test_core.py
@@ -60,6 +60,15 @@
),
id="lru",
),
pytest.param(
Parser(
caching.CachingResolver(
BasicResolver(load_builtins()),
caching.Lfu(10),
)
),
id="lfu",
),
]
try:
from ua_parser import re2
