diff --git a/pyproject.toml b/pyproject.toml
index b42d432..9acef8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ version = "1.0.0a1"
 readme = "README.rst"
 requires-python = ">=3.8"
 dependencies = []
-optional-dependencies = { yaml = ["PyYaml"] }
+optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }
 
 license = {text = "Apache 2.0"}
 urls = {repository = "https://github.com/ua-parser/uap-python"}
diff --git a/src/ua_parser/re2.py b/src/ua_parser/re2.py
new file mode 100644
index 0000000..07a2918
--- /dev/null
+++ b/src/ua_parser/re2.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+
+import re
+from typing import List, Sequence, Union
+
+import re2  # type: ignore
+
+from .core import (
+    Parser as AbstractParser,
+    PartialParseResult,
+    Domain,
+    Matchers,
+    UserAgentMatcher,
+    OSMatcher,
+    DeviceMatcher,
+)
+
+
+def _build_filter(
+    matchers: Sequence[Union[UserAgentMatcher, OSMatcher, DeviceMatcher]],
+) -> re2.Filter:
+    """Compile the patterns of ``matchers`` into a single ``re2.Filter``.
+
+    Patterns are added in ``matchers`` order, which is what lets callers
+    use the indices reported by the filter to index back into ``matchers``.
+    """
+    f = re2.Filter()
+    for m in matchers:
+        # Prepend the i global flag if IGNORECASE is set. Assumes
+        # no pattern uses global flags, but since they're not
+        # supported in JS that seems safe.
+        if m.regex.flags & re.IGNORECASE:
+            f.Add("(?i)" + m.regex.pattern)
+        else:
+            f.Add(m.regex.pattern)
+    f.Compile()
+    return f
+
+
+class Parser(AbstractParser):
+    """Parser which prefilters matchers through ``re2.Filter``.
+
+    One filter is compiled per domain from the patterns of that
+    domain's matchers. Parsing asks the filter which patterns match
+    the user agent string, then only calls the lowest-index matcher.
+    """
+
+    ua: re2.Filter
+    user_agent_parsers: List[UserAgentMatcher]
+    os: re2.Filter
+    os_parsers: List[OSMatcher]
+    devices: re2.Filter
+    device_parsers: List[DeviceMatcher]
+
+    def __init__(
+        self,
+        matchers: Matchers,
+    ) -> None:
+        """Store the matchers and compile one filter per domain.
+
+        :param matchers: the user agent, OS and device matchers, in
+                         that order.
+        """
+        self.user_agent_parsers, self.os_parsers, self.device_parsers = matchers
+
+        # IGNORECASE is honoured for every domain so a filter can never
+        # be stricter than the matchers it stands in for.
+        self.ua = _build_filter(self.user_agent_parsers)
+        self.os = _build_filter(self.os_parsers)
+        self.devices = _build_filter(self.device_parsers)
+
+    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
+        """Parse ``ua`` for each of the requested ``domains``.
+
+        A domain which was not requested, or for which the filter
+        reports no matching pattern, is ``None`` in the result.
+        """
+        user_agent = os = device = None
+        if Domain.USER_AGENT in domains:
+            if matches := self.ua.Match(ua):
+                # Set/Filter does not return the match in index order
+                # (position order?) so to fit UAP semantics we need to
+                # extract the first matching regex (lowest index).
+                user_agent = self.user_agent_parsers[min(matches)](ua)
+        if Domain.OS in domains:
+            if matches := self.os.Match(ua):
+                os = self.os_parsers[min(matches)](ua)
+        if Domain.DEVICE in domains:
+            if matches := self.devices.Match(ua):
+                device = self.device_parsers[min(matches)](ua)
+        return PartialParseResult(
+            domains=domains, string=ua, user_agent=user_agent, os=os, device=device
+        )
diff --git a/tests/test_core.py b/tests/test_core.py
index af03667..924c33e 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -54,6 +54,10 @@
         id="lru",
     ),
 ]
+with contextlib.suppress(ImportError):
+    from ua_parser import re2
+
+    PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2"))
 
 UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}
 
diff --git a/tox.ini b/tox.ini
index 36ac52d..5e6cc01 100644
--- a/tox.ini
+++ b/tox.ini
@@ -19,9 +19,15 @@ wheel_build_env = .pkg
 deps =
     pytest
     pyyaml
+    google-re2
 commands =
     pytest -Werror --doctest-glob="*.rst" {posargs}
 
+[testenv:pypy3.{8,9,10},py312]
+deps =
+    pytest
+    pyyaml
+
 [testenv:flake8]
 package = skip
 deps = flake8