Set playwright_page request meta key early (#91)

scrapy-plugins · May 9, 2022 · 251bdc7 · 251bdc7
1 parent fa7d5f1
commit 251bdc7
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -239,8 +239,10 @@ class AwesomeSpiderWithPage(scrapy.Spider):
 * In order to avoid memory issues, it is recommended to manually close the page
   by awaiting the `Page.close` coroutine.
 * Be careful about leaving pages unclosed, as they count towards the limit set by
-  `PLAYWRIGHT_MAX_PAGES_PER_CONTEXT`. It's recommended to set a Request errback to
-  make sure pages are closed even if a request fails.
+  `PLAYWRIGHT_MAX_PAGES_PER_CONTEXT`. When passing `playwright_include_page=True`,
+  it's recommended to set a Request errback to make sure pages are closed even
+  if a request fails (if `playwright_include_page=False` or unset, pages are
+  automatically closed upon encountering an exception).
 * Any network operations resulting from awaiting a coroutine on a `Page` object
   (`goto`, `go_back`, etc.) will be executed directly by Playwright, bypassing the
   Scrapy request workflow (Scheduler, Middlewares, etc).

diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
@@ -211,17 +211,17 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
             return result
 
     async def _download_request_with_page(self, request: Request, page: Page) -> Response:
+        # set this early to make it available in errbacks even if something fails
+        if request.meta.get("playwright_include_page"):
+            request.meta["playwright_page"] = page
+
         start_time = time()
         response = await page.goto(request.url)
-
         await self._apply_page_methods(page, request)
-
         body_str = await page.content()
         request.meta["download_latency"] = time() - start_time
 
-        if request.meta.get("playwright_include_page"):
-            request.meta["playwright_page"] = page
-        else:
+        if not request.meta.get("playwright_include_page"):
             await page.close()
             self.stats.inc_value("playwright/page_count/closed")