diff --git a/README.md b/README.md
index ba868c2b..70d5ed2e 100644
--- a/README.md
+++ b/README.md
@@ -38,9 +38,9 @@ $ pip install scrapy-playwright
 
 Please see the [changelog.md](changelog.md) file.
 
-## Configuration
+## Activation
 
-Replace the default `http` and `https` Download Handlers through
+Replace the default `http` and/or `https` Download Handlers through
 [`DOWNLOAD_HANDLERS`](https://docs.scrapy.org/en/latest/topics/settings.html):
 
 ```python
@@ -60,7 +60,7 @@ Also, be sure to [install the `asyncio`-based Twisted reactor](https://docs.scra
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 ```
 
-### Settings
+## Settings
 
 `scrapy-playwright` accepts the following settings:
 
@@ -99,9 +99,10 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 
 * `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Union[Callable, str]`, default `scrapy_playwright.headers.use_scrapy_headers`)
 
-    The path to a coroutine function (`async def`) that processes headers for a given request
+    A function (or the path to a function) that processes headers for a given request
     and returns a dictionary with the headers to be used (note that, depending on the browser,
-    additional default headers will be sent as well).
+    additional default headers will be sent as well). Coroutine functions (`async def`) are
+    supported.
 
     The function must return a `dict` object, and receives the following keyword arguments:
 
@@ -156,13 +157,14 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
     ```
 
-    Please note:
+    Please note that all requests will appear in the DEBUG level logs, however there will
+    be no corresponding response log lines for aborted requests. Aborted requests
+    are counted in the `playwright/request_count/aborted` job stats item.
 
-    * All requests will appear in the DEBUG level logs, however there will
-      be no corresponding response log lines for aborted requests. Aborted requests
-      are counted in the `playwright/request_count/aborted` job stats item.
-    * Passing callable objects is only supported when using Scrapy>=2.4. With prior
-      versions, only strings containing object paths are supported.
+### General note about settings
+
+For the settings which accept object paths as strings, passing callable objects is
+only supported when using Scrapy>=2.4. With prior versions, only strings are supported.
 
 
 ## Basic usage
 
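To make the relaxed `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` contract concrete, here is a minimal sketch of a plain (non-coroutine) header-processing function. The function name, module path and header policy are hypothetical; the argument order follows the call site visible in the handler diff below.

```python
# Hypothetical example: a plain function now works where previously only
# a coroutine-function path was accepted; an `async def` variant would be
# awaited by the handler just the same.
def keep_user_agent(browser_type, playwright_request, scrapy_headers):
    # Start from the headers Playwright already set for the request
    headers = dict(playwright_request.headers)
    # Override the User-Agent with the one resolved by Scrapy, if any
    # (Scrapy header values are bytes)
    user_agent = scrapy_headers.get("user-agent")
    if user_agent:
        headers["user-agent"] = user_agent.decode("utf-8")
    return headers


# In settings.py: passing the callable itself requires Scrapy>=2.4
# (see the general note above); an import-path string works everywhere.
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = keep_user_agent
# PLAYWRIGHT_PROCESS_REQUEST_HEADERS = "myproject.utils.keep_user_agent"
```

Returning a plain `dict` keeps the contract identical for sync and async implementations.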
diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
index 6875e365..2bf484ad 100644
--- a/scrapy_playwright/handler.py
+++ b/scrapy_playwright/handler.py
@@ -41,26 +41,6 @@
 logger = logging.getLogger("scrapy-playwright")
 
 
-def _make_request_logger(context_name: str) -> Callable:
-    def _log_request(request: PlaywrightRequest) -> None:
-        logger.debug(
-            f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
-            f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
-        )
-
-    return _log_request
-
-
-def _make_response_logger(context_name: str) -> Callable:
-    def _log_request(response: PlaywrightResponse) -> None:
-        logger.debug(
-            f"[Context={context_name}] Response: <{response.status} {response.url}> "
-            f"(referrer: {response.headers.get('referer')})"
-        )
-
-    return _log_request
-
-
 class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
     def __init__(self, crawler: Crawler) -> None:
         super().__init__(settings=crawler.settings, crawler=crawler)
@@ -289,9 +269,7 @@ async def _apply_page_methods(self, page: Page, request: Request) -> None:
                 except AttributeError:
                     logger.warning(f"Ignoring {repr(pm)}: could not find method")
                 else:
-                    pm.result = method(*pm.args, **pm.kwargs)
-                    if isinstance(pm.result, Awaitable):
-                        pm.result = await pm.result
+                    pm.result = await _await_if_necessary(method(*pm.args, **pm.kwargs))
                     await page.wait_for_load_state(timeout=self.default_navigation_timeout)
             else:
                 logger.warning(f"Ignoring {repr(pm)}: expected PageMethod, got {repr(type(pm))}")
@@ -333,16 +311,14 @@ def _make_request_handler(
         async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
             """Override request headers, method and body."""
             if self.abort_request:
-                should_abort = self.abort_request(playwright_request)
-                if isinstance(should_abort, Awaitable):
-                    should_abort = await should_abort
+                should_abort = await _await_if_necessary(self.abort_request(playwright_request))
                 if should_abort:
                     await route.abort()
                     self.stats.inc_value("playwright/request_count/aborted")
                     return None
 
-            processed_headers = await self.process_request_headers(
-                self.browser_type, playwright_request, scrapy_headers
+            processed_headers = await _await_if_necessary(
+                self.process_request_headers(self.browser_type, playwright_request, scrapy_headers)
             )
 
             # the request that reaches the callback should contain the headers that were sent
@@ -368,6 +344,32 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
         return _request_handler
 
 
+async def _await_if_necessary(obj):
+    if isinstance(obj, Awaitable):
+        return await obj
+    return obj
+
+
+def _make_request_logger(context_name: str) -> Callable:
+    def _log_request(request: PlaywrightRequest) -> None:
+        logger.debug(
+            f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
+            f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
+        )
+
+    return _log_request
+
+
+def _make_response_logger(context_name: str) -> Callable:
+    def _log_request(response: PlaywrightResponse) -> None:
+        logger.debug(
+            f"[Context={context_name}] Response: <{response.status} {response.url}> "
+            f"(referrer: {response.headers.get('referer')})"
+        )
+
+    return _log_request
+
+
 def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]:
     if headers.get("content-type"):
         content_type = to_unicode(headers["content-type"])
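The core of the handler refactor is the new `_await_if_necessary` helper, which collapses the repeated "call, then await if the result is awaitable" pattern into one place. A self-contained sketch of the behavior it provides, with made-up predicate functions standing in for user-supplied callables such as `PLAYWRIGHT_ABORT_REQUEST`:

```python
import asyncio
from collections.abc import Awaitable


async def _await_if_necessary(obj):
    # Transparently handle both plain values and awaitables.
    if isinstance(obj, Awaitable):
        return await obj
    return obj


# Hypothetical user-supplied predicates: one sync, one async.
def sync_abort(url: str) -> bool:
    return url.endswith(".png")


async def async_abort(url: str) -> bool:
    return url.endswith(".png")


async def main() -> None:
    # Call sites no longer care whether the user supplied a sync or an
    # async callable: both yield a plain bool here.
    assert await _await_if_necessary(sync_abort("https://example.org/a.png"))
    assert await _await_if_necessary(async_abort("https://example.org/a.png"))


asyncio.run(main())
```

Moving the two logger factories below the class is a pure relocation; their bodies are unchanged.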