Add support for random replacement, fifo, and lfu caches
While more complicated to implement than just using an OrderedDict, the
LFU cache shows really impressive performance on the sample file, with
hit rates 10% higher than LRU in the mid range (100- to 1000-entry
caches), and a significant performance edge despite the complexity of
managing multiple linked lists.
masklinn committed Mar 2, 2024
1 parent 0367c3b commit 18cee5a
Showing 3 changed files with 210 additions and 10 deletions.
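
For orientation, a minimal usage sketch of how the new caches plug into the parser, mirroring the test added in this commit; the top-level exports used here (Parser, BasicResolver, caching, load_builtins) are assumed to be importable as they are in tests/test_core.py.

# Minimal usage sketch, mirroring the test added below. Assumes Parser,
# BasicResolver and load_builtins are importable as in tests/test_core.py.
from ua_parser import BasicResolver, Parser, caching
from ua_parser.loaders import load_builtins

# Wrap the regex-based resolver with the new LFU cache (here 1000 entries);
# caching.Random, caching.Fifo or caching.LRU can be swapped in the same way.
resolver = caching.CachingResolver(BasicResolver(load_builtins()), caching.Lfu(1000))
parser = Parser(resolver)

result = parser.parse("Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0")
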
25 changes: 18 additions & 7 deletions src/ua_parser/__main__.py
@@ -21,6 +21,7 @@
Parser,
PartialParseResult,
Resolver,
caching,
)
from .caching import Cache, Local
from .loaders import load_builtins, load_yaml
@@ -160,11 +161,15 @@ def get_parser(
if cache == "none":
return Parser(r).parse
elif cache == "clearing":
c = Clearing
c = caching.Clearing
elif cache == "random":
c = caching.Random
elif cache == "fifo":
c = caching.Fifo
elif cache == "lru":
c = LRU
elif cache == "lru-threadsafe":
c = lambda size: Locking(LRU(size)) # noqa: E731
c = caching.LRU
elif cache == "lfu":
c = caching.Lfu
else:
sys.exit(f"unknown cache algorithm {cache!r}")

@@ -206,7 +211,13 @@ def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
print(total, "lines", uniques, "uniques")
print(f"ideal hit rate: {(total - uniques)/total:.0%}")
print()
caches: List[Callable[[int], Cache]] = [Clearing, LRU]
caches: List[Callable[[int], Cache]] = [
caching.Clearing,
caching.LRU,
caching.Random,
caching.Fifo,
caching.Lfu,
]
for cache, cache_size in itertools.product(
caches,
args.cachesizes,
@@ -367,8 +378,8 @@ def __call__(
bench.add_argument(
"--caches",
nargs="+",
choices=["none", "clearing", "lru", "lru-threadsafe"],
default=["none", "clearing", "lru", "lru-threadsafe"],
choices=["none", "clearing", "random", "fifo", "lru", "lfu"],
default=["none", "clearing", "random", "fifo", "lru", "lfu"],
help="""Cache implementations to test. `clearing` completely
clears the cache when full, `lru` uses a least-recently-used eviction
policy. `lru` is not thread-safe, so `lru-threadsafe` adds a mutex
186 changes: 183 additions & 3 deletions src/ua_parser/caching.py
@@ -1,17 +1,24 @@
from __future__ import annotations

import abc
import dataclasses
import random
import threading
from collections import OrderedDict
from contextvars import ContextVar
from typing import Callable, Dict, Optional, Protocol
from typing import Callable, Dict, List, Optional, Protocol, cast

from .core import Domain, PartialParseResult, Resolver

__all__ = [
"CachingResolver",
"Cache",
"CachingResolver",
"Clearing",
"Locking",
"Fifo",
"LRU",
"Lfu",
"Locking",
"Random",
]


@@ -104,6 +111,179 @@ def __setitem__(self, key: str, value: PartialParseResult) -> None:
self.cache.popitem(last=False)


class Random:
"""Random Replacement cache. Space is made before a new entry is
inserted, to ensure the new entry is not immediately popped.
Defaults to the standard library's default PRNG, but that can be
overridden.
Non thread-safe.
"""

def __init__(self, maxsize: int, rng: random.Random = random.Random()) -> None:
self.maxsize = maxsize
self.rng = rng
self.cache: Dict[str, PartialParseResult] = {}
self.entries: List[str] = []

def __getitem__(self, key: str) -> Optional[PartialParseResult]:
return self.cache.get(key)

def __setitem__(self, key: str, entry: PartialParseResult) -> None:
if len(self.cache) >= self.maxsize:
idx = self.rng.randrange(len(self.entries))
k = self.entries[idx]
del self.cache[k]

self.entries[idx] = key
self.cache[key] = entry
else:
self.entries.append(key)
self.cache[key] = entry


class Fifo:
"""First-in first-out cache.
Non thread-safe
"""

def __init__(self, maxsize: int) -> None:
self.maxsize = maxsize
self.cache: OrderedDict[str, PartialParseResult] = OrderedDict()

def __getitem__(self, key: str) -> Optional[PartialParseResult]:
return self.cache.get(key)

def __setitem__(self, key: str, entry: PartialParseResult) -> None:
self.cache[key] = entry
while len(self.cache) > self.maxsize:
self.cache.popitem(last=False)


@dataclasses.dataclass
class LfuNode:
__slots__ = ("prev", "next", "freq", "first", "last")
prev: Optional[LfuNode]
next: Optional[LfuNode]
freq: int
first: LfuEntry
last: LfuEntry

def append(self, entry: LfuEntry) -> None:
entry.node = self
entry.prev = self.last
self.last.next = entry
self.last = entry


@dataclasses.dataclass
class LfuEntry:
__slots__ = ("node", "prev", "next", "key", "value")
node: LfuNode
prev: Optional[LfuEntry]
next: Optional[LfuEntry]
key: str
value: PartialParseResult

def pop(self) -> bool:
node = self.node
if self.prev and self.next: # middle of list
self.next.prev = self.prev
self.prev.next = self.next
elif self.next: # first entry
node.first = self.next
node.first.prev = None
elif self.prev: # last entry
node.last = self.prev
node.last.next = None
else: # sole entry
return True
return False


class Lfu:
"""Least frequently used cache.
Absolutely not even remotely thread-safe.
"""

def __init__(self, maxsize: int) -> None:
self.maxsize = maxsize
self.freqlist: Optional[LfuNode] = None
self.entries: Dict[str, LfuEntry] = {}

def __getitem__(self, key: str) -> Optional[PartialParseResult]:
e = self.entries.get(key)
if not e:
return None

self.bump_frequency(e)
return e.value

def bump_frequency(self, entry: LfuEntry) -> None:
node = entry.node
delete_node = entry.pop()

entry.prev = None
entry.next = None
if node.next and node.next.freq == node.freq + 1:
next_node = node.next
next_node.append(entry)
else:
old_next = node.next
next_node = entry.node = LfuNode(
freq=node.freq + 1,
first=entry,
last=entry,
prev=node,
next=node.next,
)
# insert new node in the linked list
node.next = next_node
if old_next:
old_next.prev = next_node

if delete_node:
if node.prev and node.next:
node.next.prev = node.prev
node.prev.next = node.next
elif node.next: # first entry
node.next.prev = None
self.freqlist = node.next
# we can ignore the last entry case (and sole entry
# sub-case) since `next_node` is necessarily after the
# current node

def __setitem__(self, key: str, entry: PartialParseResult) -> None:
# optimise: if the first node has freq 1 and a single entry,
# we're deleting the first node only to add it back,
# whereas we should swap in the new entry
if len(self.entries) >= self.maxsize:
assert self.freqlist
e = self.freqlist.first
del self.entries[e.key]
if e.pop():
self.freqlist = self.freqlist.next
if self.freqlist:
self.freqlist.prev = None

e = self.entries[key] = LfuEntry(
node=cast(LfuNode, None), key=key, value=entry, prev=None, next=None
)
if self.freqlist and self.freqlist.freq == 1:
self.freqlist.append(e)
else:
node = e.node = LfuNode(
freq=1, first=e, last=e, prev=None, next=self.freqlist
)
if self.freqlist:
self.freqlist.prev = node
self.freqlist = node


class Locking:
"""Locking cache decorator. Takes a non-thread-safe cache and
ensures retrieving and setting entries is protected by a mutex.
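
To make the frequency-list bookkeeping above concrete, here is a small illustrative sequence. The keys and stored values are hypothetical: the cache never inspects the values it stores, so opaque placeholders stand in for PartialParseResult instances.

# Illustrative sketch of LFU eviction order, assuming the Lfu class above.
from ua_parser import caching

cache = caching.Lfu(2)
r1, r2, r3 = object(), object(), object()  # placeholders for PartialParseResult values

cache["ua-a"] = r1   # "ua-a" lands in the freq-1 node
cache["ua-b"] = r2   # "ua-b" joins the same freq-1 node
cache["ua-a"]        # hit: bump_frequency moves "ua-a" to a new freq-2 node
cache["ua-c"] = r3   # cache full: evicts "ua-b", the least frequently used entry
assert cache["ua-b"] is None
assert cache["ua-a"] is not None

# Random similarly accepts an injectable PRNG, e.g. caching.Random(100,
# rng=random.Random(42)), which makes evictions reproducible in tests.
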
9 changes: 9 additions & 0 deletions tests/test_core.py
@@ -60,6 +60,15 @@
),
id="lru",
),
pytest.param(
Parser(
caching.CachingResolver(
BasicResolver(load_builtins()),
caching.Lfu(10),
)
),
id="lfu",
),
]
try:
from ua_parser import re2
