Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Accept sync functions to process headers #87

Merged
merged 1 commit into from
May 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ $ pip install scrapy-playwright
Please see the [changelog.md](changelog.md) file.


## Configuration
## Activation

Replace the default `http` and `https` Download Handlers through
Replace the default `http` and/or `https` Download Handlers through
[`DOWNLOAD_HANDLERS`](https://docs.scrapy.org/en/latest/topics/settings.html):

```python
Expand All @@ -60,7 +60,7 @@ Also, be sure to [install the `asyncio`-based Twisted reactor](https://docs.scra
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
```

### Settings
## Settings

`scrapy-playwright` accepts the following settings:

Expand Down Expand Up @@ -99,9 +99,10 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Union[Callable, str]`, default `scrapy_playwright.headers.use_scrapy_headers`)

The path to a coroutine function (`async def`) that processes headers for a given request
A function (or the path to a function) that processes headers for a given request
and returns a dictionary with the headers to be used (note that, depending on the browser,
additional default headers will be sent as well).
additional default headers will be sent as well). Coroutine functions (`async def`) are
supported.

The function must return a `dict` object, and receives the following keyword arguments:

Expand Down Expand Up @@ -156,13 +157,14 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

```

Please note:
Please note that all requests will appear in the DEBUG level logs; however, there will
be no corresponding response log lines for aborted requests. Aborted requests
are counted in the `playwright/request_count/aborted` job stats item.

* All requests will appear in the DEBUG level logs, however there will
be no corresponding response log lines for aborted requests. Aborted requests
are counted in the `playwright/request_count/aborted` job stats item.
* Passing callable objects is only supported when using Scrapy>=2.4. With prior
versions, only strings containing object paths are supported.
### General note about settings

For the settings which accept object paths as strings, passing callable objects is
only supported when using Scrapy>=2.4. With prior versions, only strings are supported.


## Basic usage
Expand Down
58 changes: 30 additions & 28 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,26 +41,6 @@
logger = logging.getLogger("scrapy-playwright")


def _make_request_logger(context_name: str) -> Callable:
    """Build a DEBUG-level logging callback for Playwright requests.

    The returned function logs a request's method, URL, resource type and
    referer header, tagged with the browser context name it was created for.
    """

    def _log_request(request: PlaywrightRequest) -> None:
        message = (
            f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
            f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
        )
        logger.debug(message)

    return _log_request


def _make_response_logger(context_name: str) -> Callable:
    """Build a DEBUG-level logging callback for Playwright responses.

    The returned function logs a response's status, URL and referer header,
    tagged with the browser context name it was created for.
    """

    # Renamed from the copy-pasted ``_log_request`` so that tracebacks and
    # function reprs correctly identify this as the *response* logger.
    def _log_response(response: PlaywrightResponse) -> None:
        logger.debug(
            f"[Context={context_name}] Response: <{response.status} {response.url}> "
            f"(referrer: {response.headers.get('referer')})"
        )

    return _log_response


class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
def __init__(self, crawler: Crawler) -> None:
super().__init__(settings=crawler.settings, crawler=crawler)
Expand Down Expand Up @@ -289,9 +269,7 @@ async def _apply_page_methods(self, page: Page, request: Request) -> None:
except AttributeError:
logger.warning(f"Ignoring {repr(pm)}: could not find method")
else:
pm.result = method(*pm.args, **pm.kwargs)
if isinstance(pm.result, Awaitable):
pm.result = await pm.result
pm.result = await _await_if_necessary(method(*pm.args, **pm.kwargs))
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
else:
logger.warning(f"Ignoring {repr(pm)}: expected PageMethod, got {repr(type(pm))}")
Expand Down Expand Up @@ -333,16 +311,14 @@ def _make_request_handler(
async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
"""Override request headers, method and body."""
if self.abort_request:
should_abort = self.abort_request(playwright_request)
if isinstance(should_abort, Awaitable):
should_abort = await should_abort
should_abort = await _await_if_necessary(self.abort_request(playwright_request))
if should_abort:
await route.abort()
self.stats.inc_value("playwright/request_count/aborted")
return None

processed_headers = await self.process_request_headers(
self.browser_type, playwright_request, scrapy_headers
processed_headers = await _await_if_necessary(
self.process_request_headers(self.browser_type, playwright_request, scrapy_headers)
)

# the request that reaches the callback should contain the headers that were sent
Expand All @@ -368,6 +344,32 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
return _request_handler


async def _await_if_necessary(obj):
if isinstance(obj, Awaitable):
return await obj
return obj


def _make_request_logger(context_name: str) -> Callable:
    """Create a callback that logs Playwright requests at DEBUG level.

    Each log line carries the given browser context name plus the request's
    method, URL, resource type and referer header.
    """

    def _log_request(request: PlaywrightRequest) -> None:
        line = (
            f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
            f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
        )
        logger.debug(line)

    return _log_request


def _make_response_logger(context_name: str) -> Callable:
    """Create a callback that logs Playwright responses at DEBUG level.

    Each log line carries the given browser context name plus the response's
    status, URL and referer header.
    """

    # Renamed from the copy-pasted ``_log_request`` so that tracebacks and
    # function reprs correctly identify this as the *response* logger.
    def _log_response(response: PlaywrightResponse) -> None:
        logger.debug(
            f"[Context={context_name}] Response: <{response.status} {response.url}> "
            f"(referrer: {response.headers.get('referer')})"
        )

    return _log_response


def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]:
if headers.get("content-type"):
content_type = to_unicode(headers["content-type"])
Expand Down