Skip to content

Commit

Permalink
Accept sync functions to process headers (#87)
Browse files Browse the repository at this point in the history
  • Loading branch information
elacuesta authored May 6, 2022
1 parent afef146 commit fa7d5f1
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 39 deletions.
24 changes: 13 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ $ pip install scrapy-playwright
Please see the [changelog.md](changelog.md) file.


## Configuration
## Activation

Replace the default `http` and `https` Download Handlers through
Replace the default `http` and/or `https` Download Handlers through
[`DOWNLOAD_HANDLERS`](https://docs.scrapy.org/en/latest/topics/settings.html):

```python
Expand All @@ -60,7 +60,7 @@ Also, be sure to [install the `asyncio`-based Twisted reactor](https://docs.scra
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
```

### Settings
## Settings

`scrapy-playwright` accepts the following settings:

Expand Down Expand Up @@ -99,9 +99,10 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Union[Callable, str]`, default `scrapy_playwright.headers.use_scrapy_headers`)

The path to a coroutine function (`async def`) that processes headers for a given request
A function (or the path to a function) that processes headers for a given request
and returns a dictionary with the headers to be used (note that, depending on the browser,
additional default headers will be sent as well).
additional default headers will be sent as well). Coroutine functions (`async def`) are
supported.

The function must return a `dict` object, and receives the following keyword arguments:

Expand Down Expand Up @@ -156,13 +157,14 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

```

Please note:
Please note that all requests will appear in the DEBUG level logs, however there will
be no corresponding response log lines for aborted requests. Aborted requests
are counted in the `playwright/request_count/aborted` job stats item.

* All requests will appear in the DEBUG level logs, however there will
be no corresponding response log lines for aborted requests. Aborted requests
are counted in the `playwright/request_count/aborted` job stats item.
* Passing callable objects is only supported when using Scrapy>=2.4. With prior
versions, only strings containing object paths are supported.
### General note about settings

For the settings which accept object paths as strings, passing callable objects is
only supported when using Scrapy>=2.4. With prior versions, only strings are supported.


## Basic usage
Expand Down
58 changes: 30 additions & 28 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,26 +41,6 @@
logger = logging.getLogger("scrapy-playwright")


def _make_request_logger(context_name: str) -> Callable:
    """Build a per-context callback that logs outgoing Playwright requests at DEBUG level."""

    def _log_request(request: PlaywrightRequest) -> None:
        # Include method, URL, resource type and referrer so request lines are traceable.
        message = (
            f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
            f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
        )
        logger.debug(message)

    return _log_request


def _make_response_logger(context_name: str) -> Callable:
    """Build a per-context callback that logs received Playwright responses at DEBUG level."""

    # Fixed copy-paste naming: the inner callback logs responses, not requests.
    def _log_response(response: PlaywrightResponse) -> None:
        logger.debug(
            f"[Context={context_name}] Response: <{response.status} {response.url}> "
            f"(referrer: {response.headers.get('referer')})"
        )

    return _log_response


class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
def __init__(self, crawler: Crawler) -> None:
super().__init__(settings=crawler.settings, crawler=crawler)
Expand Down Expand Up @@ -289,9 +269,7 @@ async def _apply_page_methods(self, page: Page, request: Request) -> None:
except AttributeError:
logger.warning(f"Ignoring {repr(pm)}: could not find method")
else:
pm.result = method(*pm.args, **pm.kwargs)
if isinstance(pm.result, Awaitable):
pm.result = await pm.result
pm.result = await _await_if_necessary(method(*pm.args, **pm.kwargs))
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
else:
logger.warning(f"Ignoring {repr(pm)}: expected PageMethod, got {repr(type(pm))}")
Expand Down Expand Up @@ -333,16 +311,14 @@ def _make_request_handler(
async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
"""Override request headers, method and body."""
if self.abort_request:
should_abort = self.abort_request(playwright_request)
if isinstance(should_abort, Awaitable):
should_abort = await should_abort
should_abort = await _await_if_necessary(self.abort_request(playwright_request))
if should_abort:
await route.abort()
self.stats.inc_value("playwright/request_count/aborted")
return None

processed_headers = await self.process_request_headers(
self.browser_type, playwright_request, scrapy_headers
processed_headers = await _await_if_necessary(
self.process_request_headers(self.browser_type, playwright_request, scrapy_headers)
)

# the request that reaches the callback should contain the headers that were sent
Expand All @@ -368,6 +344,32 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
return _request_handler


async def _await_if_necessary(obj):
if isinstance(obj, Awaitable):
return await obj
return obj


def _make_request_logger(context_name: str) -> Callable:
    """Build a per-context callback that logs outgoing Playwright requests at DEBUG level."""

    def _log_request(request: PlaywrightRequest) -> None:
        # Include method, URL, resource type and referrer so request lines are traceable.
        message = (
            f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
            f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
        )
        logger.debug(message)

    return _log_request


def _make_response_logger(context_name: str) -> Callable:
    """Build a per-context callback that logs received Playwright responses at DEBUG level."""

    # Fixed copy-paste naming: the inner callback logs responses, not requests.
    def _log_response(response: PlaywrightResponse) -> None:
        logger.debug(
            f"[Context={context_name}] Response: <{response.status} {response.url}> "
            f"(referrer: {response.headers.get('referer')})"
        )

    return _log_response


def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]:
if headers.get("content-type"):
content_type = to_unicode(headers["content-type"])
Expand Down

0 comments on commit fa7d5f1

Please sign in to comment.