From 5b254e4dfcc234d9ac70200ec096ac89c1ce8b41 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta <1731933+elacuesta@users.noreply.github.com> Date: Tue, 28 Nov 2023 23:18:08 -0300 Subject: [PATCH] Configuration class (#248) * Config class * Do not delete context kwargs after init * Fix attribute path --- scrapy_playwright/handler.py | 113 ++++++++++-------- tests/tests_asyncio/test_browser_contexts.py | 21 ---- .../tests_asyncio/test_playwright_requests.py | 35 ------ tests/tests_asyncio/test_settings.py | 39 ++++++ tests/tests_asyncio/test_utils.py | 1 + 5 files changed, 102 insertions(+), 107 deletions(-) create mode 100644 tests/tests_asyncio/test_settings.py diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 8b24129..91ded95 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -23,6 +23,7 @@ from scrapy.http import Request, Response from scrapy.http.headers import Headers from scrapy.responsetypes import responsetypes +from scrapy.settings import Settings from scrapy.utils.defer import deferred_from_coro from scrapy.utils.misc import load_object from scrapy.utils.reactor import verify_installed_reactor @@ -60,57 +61,68 @@ class BrowserContextWrapper: persistent: bool +@dataclass +class Config: + cdp_url: Optional[str] + cdp_kwargs: dict + browser_type_name: str + launch_options: dict + max_pages_per_context: int + max_contexts: Optional[int] + startup_context_kwargs: dict + navigation_timeout: Optional[float] = None + + @classmethod + def from_settings(cls, settings: Settings) -> "Config": + cfg = cls( + cdp_url=settings.get("PLAYWRIGHT_CDP_URL"), + cdp_kwargs=settings.getdict("PLAYWRIGHT_CDP_KWARGS") or {}, + browser_type_name=settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE, + launch_options=settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}, + max_pages_per_context=settings.getint("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"), + max_contexts=settings.getint("PLAYWRIGHT_MAX_CONTEXTS") or None, + startup_context_kwargs=settings.getdict("PLAYWRIGHT_CONTEXTS"), + ) + cfg.cdp_kwargs.pop("endpoint_url", None) + if not cfg.max_pages_per_context: + cfg.max_pages_per_context = settings.getint("CONCURRENT_REQUESTS") + if cfg.cdp_url and cfg.launch_options: + logger.warning("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS") + if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings: + with suppress(TypeError, ValueError): + cfg.navigation_timeout = float(settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"]) + return cfg + + class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler): def __init__(self, crawler: Crawler) -> None: - settings = crawler.settings - super().__init__(settings=settings, crawler=crawler) + super().__init__(settings=crawler.settings, crawler=crawler) verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor") crawler.signals.connect(self._engine_started, signals.engine_started) self.stats = crawler.stats - # browser - self.browser_cdp_url = settings.get("PLAYWRIGHT_CDP_URL") - self.browser_cdp_kwargs = settings.get("PLAYWRIGHT_CDP_KWARGS") or {} - self.browser_cdp_kwargs.pop("endpoint_url", None) - self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE - self.browser_launch_lock = asyncio.Lock() - self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {} - if self.browser_cdp_url and self.launch_options: - logger.warning("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS") + self.config = Config.from_settings(crawler.settings) - # contexts - self.max_pages_per_context: int = settings.getint( - "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT" - ) or settings.getint("CONCURRENT_REQUESTS") + self.browser_launch_lock = asyncio.Lock() self.context_launch_lock = asyncio.Lock() self.context_wrappers: Dict[str, BrowserContextWrapper] = {} - self.startup_context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS") - if settings.getint("PLAYWRIGHT_MAX_CONTEXTS"): - self.context_semaphore = asyncio.Semaphore( - value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS") - ) - - self.default_navigation_timeout: Optional[float] = None - if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings: - with suppress(TypeError, ValueError): - self.default_navigation_timeout = float( - settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT") - ) + if self.config.max_contexts: + self.context_semaphore = asyncio.Semaphore(value=self.config.max_contexts) # headers - if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings: - if settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None: + if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings: + if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None: self.process_request_headers = None else: self.process_request_headers = load_object( - settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] + crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] ) else: self.process_request_headers = use_scrapy_headers self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None - if settings.get("PLAYWRIGHT_ABORT_REQUEST"): - self.abort_request = load_object(settings["PLAYWRIGHT_ABORT_REQUEST"]) + if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"): + self.abort_request = load_object(crawler.settings["PLAYWRIGHT_ABORT_REQUEST"]) @classmethod def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler: @@ -125,35 +137,34 @@ async def _launch(self) -> None: logger.info("Starting download handler") self.playwright_context_manager = PlaywrightContextManager() self.playwright = await self.playwright_context_manager.start() - self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name) - if self.startup_context_kwargs: - logger.info("Launching %i startup context(s)", len(self.startup_context_kwargs)) + self.browser_type: BrowserType = getattr(self.playwright, self.config.browser_type_name) + if self.config.startup_context_kwargs: + logger.info("Launching %i startup context(s)", len(self.config.startup_context_kwargs)) await asyncio.gather( *[ self._create_browser_context(name=name, context_kwargs=kwargs) - for name, kwargs in self.startup_context_kwargs.items() + for name, kwargs in self.config.startup_context_kwargs.items() ] ) self._set_max_concurrent_context_count() logger.info("Startup context(s) launched") self.stats.set_value("playwright/page_count", self._get_total_page_count()) - del self.startup_context_kwargs async def _maybe_launch_browser(self) -> None: async with self.browser_launch_lock: if not hasattr(self, "browser"): logger.info("Launching browser %s", self.browser_type.name) - self.browser = await self.browser_type.launch(**self.launch_options) + self.browser = await self.browser_type.launch(**self.config.launch_options) logger.info("Browser %s launched", self.browser_type.name) async def _maybe_connect_devtools(self) -> None: async with self.browser_launch_lock: if not hasattr(self, "browser"): - logger.info("Connecting using CDP: %s", self.browser_cdp_url) + logger.info("Connecting using CDP: %s", self.config.cdp_url) self.browser = await self.browser_type.connect_over_cdp( - self.browser_cdp_url, **self.browser_cdp_kwargs + self.config.cdp_url, **self.config.cdp_kwargs ) - logger.info("Connected using CDP: %s", self.browser_cdp_url) + logger.info("Connected using CDP: %s", self.config.cdp_url) async def _create_browser_context( self, @@ -171,7 +182,7 @@ async def _create_browser_context( context = await self.browser_type.launch_persistent_context(**context_kwargs) persistent = True remote = False - elif self.browser_cdp_url: + elif self.config.cdp_url: await self._maybe_connect_devtools() context = await self.browser.new_context(**context_kwargs) persistent = False @@ -200,11 +211,11 @@ async def _create_browser_context( "remote": remote, }, ) - if self.default_navigation_timeout is not None: - context.set_default_navigation_timeout(self.default_navigation_timeout) + if self.config.navigation_timeout is not None: + context.set_default_navigation_timeout(self.config.navigation_timeout) self.context_wrappers[name] = BrowserContextWrapper( context=context, - semaphore=asyncio.Semaphore(value=self.max_pages_per_context), + semaphore=asyncio.Semaphore(value=self.config.max_pages_per_context), persistent=persistent, ) self._set_max_concurrent_context_count() @@ -243,8 +254,8 @@ async def _create_page(self, request: Request, spider: Spider) -> Page: }, ) self._set_max_concurrent_page_count() - if self.default_navigation_timeout is not None: - page.set_default_navigation_timeout(self.default_navigation_timeout) + if self.config.navigation_timeout is not None: + page.set_default_navigation_timeout(self.config.navigation_timeout) page.on("close", self._make_close_page_callback(context_name)) page.on("crash", self._make_close_page_callback(context_name)) @@ -444,9 +455,9 @@ async def _handle_download(dwnld: Download) -> None: response = await page.goto(url=request.url, **page_goto_kwargs) except PlaywrightError as err: if not ( - self.browser_type_name in ("firefox", "webkit") + self.config.browser_type_name in ("firefox", "webkit") and "Download is starting" in err.message - or self.browser_type_name == "chromium" + or self.config.browser_type_name == "chromium" and "net::ERR_ABORTED" in err.message ): raise @@ -480,7 +491,7 @@ async def _apply_page_methods(self, page: Page, request: Request, spider: Spider ) else: pm.result = await _maybe_await(method(*pm.args, **pm.kwargs)) - await page.wait_for_load_state(timeout=self.default_navigation_timeout) + await page.wait_for_load_state(timeout=self.config.navigation_timeout) else: logger.warning( "Ignoring %r: expected PageMethod, got %r", @@ -577,7 +588,7 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest) else: overrides["headers"] = final_headers = await _maybe_await( self.process_request_headers( - self.browser_type_name, playwright_request, headers + self.config.browser_type_name, playwright_request, headers ) ) diff --git a/tests/tests_asyncio/test_browser_contexts.py b/tests/tests_asyncio/test_browser_contexts.py index e7d0cb4..b75dacf 100644 --- a/tests/tests_asyncio/test_browser_contexts.py +++ b/tests/tests_asyncio/test_browser_contexts.py @@ -15,27 +15,6 @@ class MixinTestCaseMultipleContexts: - @pytest.mark.asyncio - async def test_contexts_max_settings(self): - settings = { - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1234, - } - async with make_handler(settings) as handler: - assert handler.max_pages_per_context == 1234 - - settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "CONCURRENT_REQUESTS": 9876} - async with make_handler(settings) as handler: - assert handler.max_pages_per_context == 9876 - - settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "PLAYWRIGHT_MAX_CONTEXTS": None} - async with make_handler(settings) as handler: - assert not hasattr(handler, "context_semaphore") - - settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "PLAYWRIGHT_MAX_CONTEXTS": 1234} - async with make_handler(settings) as handler: - assert handler.context_semaphore._value == 1234 - @pytest.mark.asyncio async def test_context_kwargs(self): settings_dict = { diff --git a/tests/tests_asyncio/test_playwright_requests.py b/tests/tests_asyncio/test_playwright_requests.py index a94b50e..d56d1ef 100644 --- a/tests/tests_asyncio/test_playwright_requests.py +++ b/tests/tests_asyncio/test_playwright_requests.py @@ -68,41 +68,6 @@ async def test_post_request(self): assert_correct_response(resp, req) assert "Request body: foo=bar" in resp.text - @pytest.mark.asyncio - async def test_timeout_value(self): - settings_dict = { - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - } - async with make_handler(settings_dict) as handler: - assert handler.default_navigation_timeout is None - - settings_dict = { - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": None, - } - async with make_handler(settings_dict) as handler: - assert handler.default_navigation_timeout is None - - settings_dict = { - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0, - } - async with make_handler(settings_dict) as handler: - assert handler.default_navigation_timeout == 0 - - settings_dict = { - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 123, - } - async with make_handler(settings_dict) as handler: - assert handler.default_navigation_timeout == 123 - settings_dict = { - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0.5, - } - async with make_handler(settings_dict) as handler: - assert handler.default_navigation_timeout == 0.5 - @pytest.mark.asyncio async def test_timeout_error(self): settings_dict = { diff --git a/tests/tests_asyncio/test_settings.py b/tests/tests_asyncio/test_settings.py new file mode 100644 index 0000000..42e7da5 --- /dev/null +++ b/tests/tests_asyncio/test_settings.py @@ -0,0 +1,39 @@ +from unittest import IsolatedAsyncioTestCase + +from scrapy.settings import Settings + +from scrapy_playwright.handler import Config + +from tests import make_handler + + +class TestSettings(IsolatedAsyncioTestCase): + async def test_settings_timeout_value(self): + config = Config.from_settings(Settings({})) + assert config.navigation_timeout is None + + config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": None})) + assert config.navigation_timeout is None + + config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0})) + assert config.navigation_timeout == 0 + + config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 123})) + assert config.navigation_timeout == 123 + + config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0.5})) + assert config.navigation_timeout == 0.5 + + async def test_max_pages_per_context(self): + config = Config.from_settings(Settings({"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1234})) + assert config.max_pages_per_context == 1234 + + config = Config.from_settings(Settings({"CONCURRENT_REQUESTS": 9876})) + assert config.max_pages_per_context == 9876 + + async def test_max_contexts(self): + async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": None}) as handler: + assert not hasattr(handler, "context_semaphore") + + async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": 1234}) as handler: + assert handler.context_semaphore._value == 1234 diff --git a/tests/tests_asyncio/test_utils.py b/tests/tests_asyncio/test_utils.py index d353517..ddbb741 100644 --- a/tests/tests_asyncio/test_utils.py +++ b/tests/tests_asyncio/test_utils.py @@ -6,6 +6,7 @@ from playwright.async_api import Error as PlaywrightError from scrapy import Spider from scrapy.http.headers import Headers + from scrapy_playwright._utils import ( _NAVIGATION_ERROR_MSG, _encode_body,