From e719a7ef9b003e8bd901d55584d3d5c82ed6d4d3 Mon Sep 17 00:00:00 2001 From: masklinn Date: Thu, 2 Nov 2023 21:45:02 +0100 Subject: [PATCH] Add an re2-based parser Requires splitting out some of the testenvs, as re2 is not available for pypy at all, and not yet for 3.12. Uses `re2.Filter`, which unlike the C++ `FilteredRE2` bundles prefiltering, using an `re2.Set` so likely less efficient than providing one's own e.g. aho-corasick, but avoids having to do that. At first glance according to pytest's `--durations 0` this is quite successful (unlike using `re2.Set` which was more of a mixed bag): ``` 2.54s call tests/test_core.py::test_devices[test_device.yaml-basic] 2.51s call tests/test_core.py::test_ua[pgts_browser_list.yaml-basic] 2.48s call tests/test_legacy.py::TestParse::testPGTSStrings 2.43s call tests/test_legacy.py::TestParse::testStringsDevice 0.95s call tests/test_core.py::test_devices[test_device.yaml-re2] 0.55s call tests/test_core.py::test_ua[pgts_browser_list.yaml-re2] 0.18s call tests/test_core.py::test_ua[test_ua.yaml-basic] 0.16s call tests/test_legacy.py::TestParse::testBrowserscopeStrings 0.10s call tests/test_core.py::test_ua[test_ua.yaml-re2] ``` While the "basic" parser for the new API is slightly slower than the legacy API (browserscope does use test_ua.yaml so that matches) the re2 parser is significantly faster than both: - 60% faster on test_device.yaml (~2.5s -> 1s) - 80% faster on pgts (2.5s -> 0.5s) - 40% faster on browserscope (0.16 -> 0.1) This is very encouraging, although the memory consumption has not been checked (yet). 
Fixes #149, kind-of --- pyproject.toml | 2 +- src/ua_parser/re2.py | 74 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_core.py | 3 ++ tox.ini | 6 ++++ 4 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 src/ua_parser/re2.py diff --git a/pyproject.toml b/pyproject.toml index 2be9ead..0f9f61e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ version = "1.0.0a1" readme = "README.rst" requires-python = ">=3.8" dependencies = [] -optional-dependencies = { yaml = ["PyYaml"] } +optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] } license = {text = "Apache 2.0"} urls = {repository = "https://github.com/ua-parser/uap-python"} diff --git a/src/ua_parser/re2.py b/src/ua_parser/re2.py new file mode 100644 index 0000000..f9a92c4 --- /dev/null +++ b/src/ua_parser/re2.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import io +import os +import re +from typing import List, Tuple, Union + +import re2 # type: ignore + +from .core import ( + Parser as AbstractParser, + PartialParseResult, + Device, + Domain, + OS, + UserAgent, + UserAgentMatcher, + OSMatcher, + DeviceMatcher, +) + + +class Parser(AbstractParser): + ua: re2.Filter + user_agent_matchers: List[UserAgentMatcher] + os: re2.Filter + os_matchers: List[OSMatcher] + devices: re2.Filter + device_matchers: List[DeviceMatcher] + + def __init__( + self, + matchers: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]], + ) -> None: + self.user_agent_matchers, self.os_matchers, self.device_matchers = matchers + + self.ua = re2.Filter() + for u in self.user_agent_matchers: + self.ua.Add(u.regex.pattern) + self.ua.Compile() + + self.os = re2.Filter() + for o in self.os_matchers: + self.os.Add(o.regex.pattern) + self.os.Compile() + + self.devices = re2.Filter() + for d in self.device_matchers: + # Prepend the i global flag if IGNORECASE is set. Assumes + # no pattern uses global flags, but since they're not + # supported in JS that seems safe. 
+ if d.regex.flags & re.IGNORECASE: + self.devices.Add("(?i)" + d.regex.pattern) + else: + self.devices.Add(d.regex.pattern) + self.devices.Compile() + + def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult: + user_agent = os = device = None + if Domain.USER_AGENT in domains: + if matches := self.ua.Match(ua): + # Set/Filter does not return the match in index order + # (position order?) so to fit UAP semantics we need to + # extract the first matching regex (lowest index). + user_agent = self.user_agent_matchers[min(matches)](ua) + if Domain.OS in domains: + if matches := self.os.Match(ua): + os = self.os_matchers[min(matches)](ua) + if Domain.DEVICE in domains: + if matches := self.devices.Match(ua): + device = self.device_matchers[min(matches)](ua) + return PartialParseResult( + domains=domains, string=ua, user_agent=user_agent, os=os, device=device + ) diff --git a/tests/test_core.py b/tests/test_core.py index b24f6e9..6abc06a 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -53,7 +53,10 @@ id="lru", ), ] +with contextlib.suppress(ImportError): + from ua_parser import re2 + PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2")) UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)} diff --git a/tox.ini b/tox.ini index 6dac8a2..18126ce 100644 --- a/tox.ini +++ b/tox.ini @@ -19,9 +19,15 @@ wheel_build_env = .pkg deps = pytest pyyaml + google-re2 commands = pytest -Werror --doctest-glob="*.rst" {posargs} +[testenv:pypy3.{8,9,10},py312] +deps = + pytest + pyyaml + [testenv:flake8] package = skip deps = flake8