Skip to content

Commit

Permalink
Configuration class (#248)
Browse files Browse the repository at this point in the history
* Config class

* Do not delete context kwargs after init

* Fix attribute path
  • Loading branch information
elacuesta authored Nov 29, 2023
1 parent 1e7b2f6 commit 5b254e4
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 107 deletions.
113 changes: 62 additions & 51 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from scrapy.http import Request, Response
from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes
from scrapy.settings import Settings
from scrapy.utils.defer import deferred_from_coro
from scrapy.utils.misc import load_object
from scrapy.utils.reactor import verify_installed_reactor
Expand Down Expand Up @@ -60,57 +61,68 @@ class BrowserContextWrapper:
persistent: bool


@dataclass
class Config:
    """Download-handler configuration extracted from a Scrapy settings object.

    Attributes mirror the PLAYWRIGHT_* settings; see ``from_settings`` for the
    defaulting and fallback rules applied when building an instance.
    """

    cdp_url: Optional[str]
    cdp_kwargs: dict
    browser_type_name: str
    launch_options: dict
    max_pages_per_context: int
    max_contexts: Optional[int]
    startup_context_kwargs: dict
    navigation_timeout: Optional[float] = None

    @classmethod
    def from_settings(cls, settings: Settings) -> "Config":
        """Build a ``Config`` from Scrapy *settings*, applying defaults.

        Fallbacks: the browser type defaults to ``DEFAULT_BROWSER_TYPE``;
        ``max_pages_per_context`` falls back to ``CONCURRENT_REQUESTS`` when
        unset or zero; a non-numeric navigation timeout is silently ignored.
        """
        cdp_kwargs = settings.getdict("PLAYWRIGHT_CDP_KWARGS") or {}
        # The endpoint is passed separately to connect_over_cdp, so drop any
        # copy from the kwargs to avoid a duplicate-argument error.
        cdp_kwargs.pop("endpoint_url", None)
        config = cls(
            cdp_url=settings.get("PLAYWRIGHT_CDP_URL"),
            cdp_kwargs=cdp_kwargs,
            browser_type_name=settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE,
            launch_options=settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {},
            max_pages_per_context=settings.getint("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT")
            or settings.getint("CONCURRENT_REQUESTS"),
            max_contexts=settings.getint("PLAYWRIGHT_MAX_CONTEXTS") or None,
            startup_context_kwargs=settings.getdict("PLAYWRIGHT_CONTEXTS"),
        )
        if config.cdp_url and config.launch_options:
            logger.warning("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS")
        if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:
            # Keep the None default if the value cannot be interpreted as a float.
            with suppress(TypeError, ValueError):
                config.navigation_timeout = float(
                    settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"]
                )
        return config


class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
def __init__(self, crawler: Crawler) -> None:
settings = crawler.settings
super().__init__(settings=settings, crawler=crawler)
super().__init__(settings=crawler.settings, crawler=crawler)
verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
crawler.signals.connect(self._engine_started, signals.engine_started)
self.stats = crawler.stats

# browser
self.browser_cdp_url = settings.get("PLAYWRIGHT_CDP_URL")
self.browser_cdp_kwargs = settings.get("PLAYWRIGHT_CDP_KWARGS") or {}
self.browser_cdp_kwargs.pop("endpoint_url", None)
self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE
self.browser_launch_lock = asyncio.Lock()
self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
if self.browser_cdp_url and self.launch_options:
logger.warning("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS")
self.config = Config.from_settings(crawler.settings)

# contexts
self.max_pages_per_context: int = settings.getint(
"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"
) or settings.getint("CONCURRENT_REQUESTS")
self.browser_launch_lock = asyncio.Lock()
self.context_launch_lock = asyncio.Lock()
self.context_wrappers: Dict[str, BrowserContextWrapper] = {}
self.startup_context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")
if settings.getint("PLAYWRIGHT_MAX_CONTEXTS"):
self.context_semaphore = asyncio.Semaphore(
value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS")
)

self.default_navigation_timeout: Optional[float] = None
if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:
with suppress(TypeError, ValueError):
self.default_navigation_timeout = float(
settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
)
if self.config.max_contexts:
self.context_semaphore = asyncio.Semaphore(value=self.config.max_contexts)

# headers
if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings:
if settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings:
if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
self.process_request_headers = None
else:
self.process_request_headers = load_object(
settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
)
else:
self.process_request_headers = use_scrapy_headers

self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None
if settings.get("PLAYWRIGHT_ABORT_REQUEST"):
self.abort_request = load_object(settings["PLAYWRIGHT_ABORT_REQUEST"])
if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"):
self.abort_request = load_object(crawler.settings["PLAYWRIGHT_ABORT_REQUEST"])

@classmethod
def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler:
Expand All @@ -125,35 +137,34 @@ async def _launch(self) -> None:
logger.info("Starting download handler")
self.playwright_context_manager = PlaywrightContextManager()
self.playwright = await self.playwright_context_manager.start()
self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name)
if self.startup_context_kwargs:
logger.info("Launching %i startup context(s)", len(self.startup_context_kwargs))
self.browser_type: BrowserType = getattr(self.playwright, self.config.browser_type_name)
if self.config.startup_context_kwargs:
logger.info("Launching %i startup context(s)", len(self.config.startup_context_kwargs))
await asyncio.gather(
*[
self._create_browser_context(name=name, context_kwargs=kwargs)
for name, kwargs in self.startup_context_kwargs.items()
for name, kwargs in self.config.startup_context_kwargs.items()
]
)
self._set_max_concurrent_context_count()
logger.info("Startup context(s) launched")
self.stats.set_value("playwright/page_count", self._get_total_page_count())
del self.startup_context_kwargs

async def _maybe_launch_browser(self) -> None:
async with self.browser_launch_lock:
if not hasattr(self, "browser"):
logger.info("Launching browser %s", self.browser_type.name)
self.browser = await self.browser_type.launch(**self.launch_options)
self.browser = await self.browser_type.launch(**self.config.launch_options)
logger.info("Browser %s launched", self.browser_type.name)

async def _maybe_connect_devtools(self) -> None:
async with self.browser_launch_lock:
if not hasattr(self, "browser"):
logger.info("Connecting using CDP: %s", self.browser_cdp_url)
logger.info("Connecting using CDP: %s", self.config.cdp_url)
self.browser = await self.browser_type.connect_over_cdp(
self.browser_cdp_url, **self.browser_cdp_kwargs
self.config.cdp_url, **self.config.cdp_kwargs
)
logger.info("Connected using CDP: %s", self.browser_cdp_url)
logger.info("Connected using CDP: %s", self.config.cdp_url)

async def _create_browser_context(
self,
Expand All @@ -171,7 +182,7 @@ async def _create_browser_context(
context = await self.browser_type.launch_persistent_context(**context_kwargs)
persistent = True
remote = False
elif self.browser_cdp_url:
elif self.config.cdp_url:
await self._maybe_connect_devtools()
context = await self.browser.new_context(**context_kwargs)
persistent = False
Expand Down Expand Up @@ -200,11 +211,11 @@ async def _create_browser_context(
"remote": remote,
},
)
if self.default_navigation_timeout is not None:
context.set_default_navigation_timeout(self.default_navigation_timeout)
if self.config.navigation_timeout is not None:
context.set_default_navigation_timeout(self.config.navigation_timeout)
self.context_wrappers[name] = BrowserContextWrapper(
context=context,
semaphore=asyncio.Semaphore(value=self.max_pages_per_context),
semaphore=asyncio.Semaphore(value=self.config.max_pages_per_context),
persistent=persistent,
)
self._set_max_concurrent_context_count()
Expand Down Expand Up @@ -243,8 +254,8 @@ async def _create_page(self, request: Request, spider: Spider) -> Page:
},
)
self._set_max_concurrent_page_count()
if self.default_navigation_timeout is not None:
page.set_default_navigation_timeout(self.default_navigation_timeout)
if self.config.navigation_timeout is not None:
page.set_default_navigation_timeout(self.config.navigation_timeout)

page.on("close", self._make_close_page_callback(context_name))
page.on("crash", self._make_close_page_callback(context_name))
Expand Down Expand Up @@ -444,9 +455,9 @@ async def _handle_download(dwnld: Download) -> None:
response = await page.goto(url=request.url, **page_goto_kwargs)
except PlaywrightError as err:
if not (
self.browser_type_name in ("firefox", "webkit")
self.config.browser_type_name in ("firefox", "webkit")
and "Download is starting" in err.message
or self.browser_type_name == "chromium"
or self.config.browser_type_name == "chromium"
and "net::ERR_ABORTED" in err.message
):
raise
Expand Down Expand Up @@ -480,7 +491,7 @@ async def _apply_page_methods(self, page: Page, request: Request, spider: Spider
)
else:
pm.result = await _maybe_await(method(*pm.args, **pm.kwargs))
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
await page.wait_for_load_state(timeout=self.config.navigation_timeout)
else:
logger.warning(
"Ignoring %r: expected PageMethod, got %r",
Expand Down Expand Up @@ -577,7 +588,7 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
else:
overrides["headers"] = final_headers = await _maybe_await(
self.process_request_headers(
self.browser_type_name, playwright_request, headers
self.config.browser_type_name, playwright_request, headers
)
)

Expand Down
21 changes: 0 additions & 21 deletions tests/tests_asyncio/test_browser_contexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,27 +15,6 @@


class MixinTestCaseMultipleContexts:
@pytest.mark.asyncio
async def test_contexts_max_settings(self):
settings = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1234,
}
async with make_handler(settings) as handler:
assert handler.max_pages_per_context == 1234

settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "CONCURRENT_REQUESTS": 9876}
async with make_handler(settings) as handler:
assert handler.max_pages_per_context == 9876

settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "PLAYWRIGHT_MAX_CONTEXTS": None}
async with make_handler(settings) as handler:
assert not hasattr(handler, "context_semaphore")

settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "PLAYWRIGHT_MAX_CONTEXTS": 1234}
async with make_handler(settings) as handler:
assert handler.context_semaphore._value == 1234

@pytest.mark.asyncio
async def test_context_kwargs(self):
settings_dict = {
Expand Down
35 changes: 0 additions & 35 deletions tests/tests_asyncio/test_playwright_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,41 +68,6 @@ async def test_post_request(self):
assert_correct_response(resp, req)
assert "Request body: foo=bar" in resp.text

@pytest.mark.asyncio
async def test_timeout_value(self):
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
}
async with make_handler(settings_dict) as handler:
assert handler.default_navigation_timeout is None

settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": None,
}
async with make_handler(settings_dict) as handler:
assert handler.default_navigation_timeout is None

settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0,
}
async with make_handler(settings_dict) as handler:
assert handler.default_navigation_timeout == 0

settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 123,
}
async with make_handler(settings_dict) as handler:
assert handler.default_navigation_timeout == 123
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0.5,
}
async with make_handler(settings_dict) as handler:
assert handler.default_navigation_timeout == 0.5

@pytest.mark.asyncio
async def test_timeout_error(self):
settings_dict = {
Expand Down
39 changes: 39 additions & 0 deletions tests/tests_asyncio/test_settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from unittest import IsolatedAsyncioTestCase

from scrapy.settings import Settings

from scrapy_playwright.handler import Config

from tests import make_handler


class TestSettings(IsolatedAsyncioTestCase):
    """Checks for Config.from_settings parsing and handler context limits."""

    async def test_settings_timeout_value(self):
        # Pairs of (settings dict, expected parsed navigation timeout).
        # Unset and explicit-None both leave the timeout as None; numeric
        # values (including 0 and floats) are taken as given.
        cases = [
            ({}, None),
            ({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": None}, None),
            ({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0}, 0),
            ({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 123}, 123),
            ({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0.5}, 0.5),
        ]
        for settings_dict, expected in cases:
            config = Config.from_settings(Settings(settings_dict))
            assert config.navigation_timeout == expected

    async def test_max_pages_per_context(self):
        # An explicit setting wins; otherwise CONCURRENT_REQUESTS is used.
        for settings_dict, expected in (
            ({"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1234}, 1234),
            ({"CONCURRENT_REQUESTS": 9876}, 9876),
        ):
            config = Config.from_settings(Settings(settings_dict))
            assert config.max_pages_per_context == expected

    async def test_max_contexts(self):
        # Without a limit, no semaphore is created at all.
        async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": None}) as handler:
            assert not hasattr(handler, "context_semaphore")
        # With a limit, the semaphore is initialized to that value.
        async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": 1234}) as handler:
            assert handler.context_semaphore._value == 1234
1 change: 1 addition & 0 deletions tests/tests_asyncio/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from playwright.async_api import Error as PlaywrightError
from scrapy import Spider
from scrapy.http.headers import Headers

from scrapy_playwright._utils import (
_NAVIGATION_ERROR_MSG,
_encode_body,
Expand Down

0 comments on commit 5b254e4

Please sign in to comment.