Skip to content

Commit

Permalink
Accept sync functions to process headers (#87)
Browse files Browse the repository at this point in the history
  • Loading branch information
elacuesta authored May 6, 2022
1 parent afef146 commit fa7d5f1
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 39 deletions.
24 changes: 13 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ $ pip install scrapy-playwright
Please see the [changelog.md](changelog.md) file.


## Configuration
## Activation

Replace the default `http` and `https` Download Handlers through
Replace the default `http` and/or `https` Download Handlers through
[`DOWNLOAD_HANDLERS`](https://docs.scrapy.org/en/latest/topics/settings.html):

```python
Expand All @@ -60,7 +60,7 @@ Also, be sure to [install the `asyncio`-based Twisted reactor](https://docs.scra
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
```

### Settings
## Settings

`scrapy-playwright` accepts the following settings:

Expand Down Expand Up @@ -99,9 +99,10 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Union[Callable, str]`, default `scrapy_playwright.headers.use_scrapy_headers`)

The path to a coroutine function (`async def`) that processes headers for a given request
A function (or the path to a function) that processes headers for a given request
and returns a dictionary with the headers to be used (note that, depending on the browser,
additional default headers will be sent as well).
additional default headers will be sent as well). Coroutine functions (`async def`) are
supported.

The function must return a `dict` object, and receives the following keyword arguments:

Expand Down Expand Up @@ -156,13 +157,14 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

```

Please note:
Please note that all requests will appear in the DEBUG level logs, however there will
be no corresponding response log lines for aborted requests. Aborted requests
are counted in the `playwright/request_count/aborted` job stats item.

* All requests will appear in the DEBUG level logs, however there will
be no corresponding response log lines for aborted requests. Aborted requests
are counted in the `playwright/request_count/aborted` job stats item.
* Passing callable objects is only supported when using Scrapy>=2.4. With prior
versions, only strings containing object paths are supported.
### General note about settings

For the settings which accept object paths as strings, passing callable objects is
only supported when using Scrapy>=2.4. With prior versions, only strings are supported.


## Basic usage
Expand Down
58 changes: 30 additions & 28 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,26 +41,6 @@
logger = logging.getLogger("scrapy-playwright")


def _make_request_logger(context_name: str) -> Callable:
    """Build a per-context callback that logs outgoing Playwright requests at DEBUG level."""

    def _log_request(request: PlaywrightRequest) -> None:
        # Include method, URL, resource type and referrer so request lines are traceable.
        message = (
            f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
            f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
        )
        logger.debug(message)

    return _log_request


def _make_response_logger(context_name: str) -> Callable:
    """Build a per-context callback that logs received Playwright responses at DEBUG level."""

    # Fixed copy-paste naming: the inner callback logs responses, not requests.
    def _log_response(response: PlaywrightResponse) -> None:
        logger.debug(
            f"[Context={context_name}] Response: <{response.status} {response.url}> "
            f"(referrer: {response.headers.get('referer')})"
        )

    return _log_response


class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
def __init__(self, crawler: Crawler) -> None:
super().__init__(settings=crawler.settings, crawler=crawler)
Expand Down Expand Up @@ -289,9 +269,7 @@ async def _apply_page_methods(self, page: Page, request: Request) -> None:
except AttributeError:
logger.warning(f"Ignoring {repr(pm)}: could not find method")
else:
pm.result = method(*pm.args, **pm.kwargs)
if isinstance(pm.result, Awaitable):
pm.result = await pm.result
pm.result = await _await_if_necessary(method(*pm.args, **pm.kwargs))
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
else:
logger.warning(f"Ignoring {repr(pm)}: expected PageMethod, got {repr(type(pm))}")
Expand Down Expand Up @@ -333,16 +311,14 @@ def _make_request_handler(
async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
"""Override request headers, method and body."""
if self.abort_request:
should_abort = self.abort_request(playwright_request)
if isinstance(should_abort, Awaitable):
should_abort = await should_abort
should_abort = await _await_if_necessary(self.abort_request(playwright_request))
if should_abort:
await route.abort()
self.stats.inc_value("playwright/request_count/aborted")
return None

processed_headers = await self.process_request_headers(
self.browser_type, playwright_request, scrapy_headers
processed_headers = await _await_if_necessary(
self.process_request_headers(self.browser_type, playwright_request, scrapy_headers)
)

# the request that reaches the callback should contain the headers that were sent
Expand All @@ -368,6 +344,32 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
return _request_handler


async def _await_if_necessary(obj):
if isinstance(obj, Awaitable):
return await obj
return obj


def _make_request_logger(context_name: str) -> Callable:
    """Build a per-context callback that logs outgoing Playwright requests at DEBUG level."""

    def _log_request(request: PlaywrightRequest) -> None:
        # Include method, URL, resource type and referrer so request lines are traceable.
        message = (
            f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
            f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
        )
        logger.debug(message)

    return _log_request


def _make_response_logger(context_name: str) -> Callable:
    """Build a per-context callback that logs received Playwright responses at DEBUG level."""

    # Fixed copy-paste naming: the inner callback logs responses, not requests.
    def _log_response(response: PlaywrightResponse) -> None:
        logger.debug(
            f"[Context={context_name}] Response: <{response.status} {response.url}> "
            f"(referrer: {response.headers.get('referer')})"
        )

    return _log_response


def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]:
if headers.get("content-type"):
content_type = to_unicode(headers["content-type"])
Expand Down

0 comments on commit fa7d5f1

Please sign in to comment.