diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 2b2ebd58..be621563 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -357,6 +357,13 @@ async def _download_request(self, request: Request, spider: Spider) -> Response: page=page, request=request, spider=spider, context_name=context_name ) + # We need to identify the Playwright request that matches the Scrapy request + # in order to override method and body if necessary. + # Checking the URL and Request.is_navigation_request() is not enough, e.g. + # requests produced by submitting forms can produce false positives. + # Let's track only the first request that matches the above conditions. + initial_request_done = asyncio.Event() + await page.unroute("**") await page.route( "**", @@ -368,6 +375,7 @@ async def _download_request(self, request: Request, spider: Spider) -> Response: body=request.body, encoding=request.encoding, spider=spider, + initial_request_done=initial_request_done, ), ) @@ -637,6 +645,7 @@ def _make_request_handler( body: Optional[bytes], encoding: str, spider: Spider, + initial_request_done: asyncio.Event, ) -> Callable: async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None: """Override request headers, method and body.""" @@ -676,7 +685,9 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest) if ( playwright_request.url.rstrip("/") == url.rstrip("/") and playwright_request.is_navigation_request() + and not initial_request_done.is_set() ): + initial_request_done.set() if method.upper() != playwright_request.method.upper(): overrides["method"] = method if body: diff --git a/tests/tests_asyncio/test_playwright_requests.py b/tests/tests_asyncio/test_playwright_requests.py index c9e87298..379c6bd3 100644 --- a/tests/tests_asyncio/test_playwright_requests.py +++ b/tests/tests_asyncio/test_playwright_requests.py @@ -1,3 +1,4 @@ +import asyncio import json import logging import platform @@ -112,6 +113,7 @@ async def test_route_continue_exception(self, logger): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: scrapy_request = Request(url="https://example.org", method="GET") spider = Spider("foo") + initial_request_done = asyncio.Event() req_handler = handler._make_request_handler( context_name=DEFAULT_CONTEXT_NAME, method=scrapy_request.method, @@ -120,6 +122,7 @@ async def test_route_continue_exception(self, logger): body=None, encoding="utf-8", spider=spider, + initial_request_done=initial_request_done, ) route = MagicMock() playwright_request = AsyncMock()