Override User-Agent header (#29)

* Use the browser's user agent if Scrapy's is None
* Add note about the user agent to the readme
* Content-Type header in tests
scrapy-plugins · Oct 18, 2021 · 9981cd8 · 9981cd8
1 parent 5bcac0f
commit 9981cd8
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -130,6 +130,14 @@ class AwesomeSpider(scrapy.Spider):
         yield {"url": response.url}
 ```
 
+### Notes about the User-Agent header
+
+By default, outgoing requests include the `User-Agent` set by Scrapy (through the
+`USER_AGENT` or `DEFAULT_REQUEST_HEADERS` settings, or via the `Request.headers` attribute).
+This could cause some sites to react in unexpected ways, for instance if the user agent
+does not match the browser being used. If you prefer to send the browser's own `User-Agent`,
+set the Scrapy user agent to `None`.
+
 
 ## Receiving the Page object in the callback
 

diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
@@ -217,12 +217,13 @@ def _make_request_handler(
     ) -> Callable:
         def request_handler(route: Route, pw_request: PlaywrightRequest) -> None:
             """Override request headers, method and body."""
+            headers.setdefault("user-agent", pw_request.headers.get("user-agent"))
             if pw_request.url == url:
                 overrides: dict = {"method": method, "headers": headers}
                 if body is not None:
                     overrides["post_data"] = body.decode(encoding)
-                # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
                 if self.browser_type == "firefox":
+                    # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
                     overrides["headers"]["host"] = urlparse(pw_request.url).netloc
             else:
                 overrides = {"headers": pw_request.headers.copy()}

diff --git a/tests/mockserver.py b/tests/mockserver.py
@@ -1,3 +1,4 @@
+import json
 import re
 import sys
 import time
@@ -40,11 +41,20 @@ def do_POST(self):
         self.wfile.write(body)
 
     def do_GET(self):
-        """Take a long time to reply"""
-        time.sleep(2)
+        body = "{}"
+        if self.path == "/headers":
+            body = json.dumps(dict(self.headers), indent=4)
+        else:
+            delay_match = re.match(r"^/delay/(\d+)$", self.path)
+            if delay_match:
+                delay = int(delay_match.group(1))
+                print(f"Sleeping {delay} seconds...")
+                time.sleep(delay)
+                body = json.dumps({"delay": delay})
         self.send_response(200)
+        self.send_header("Content-Type", "application/json")
         self.end_headers()
-        self.wfile.write(b"Hello world!")
+        self.wfile.write(body.encode())
 
 
 class MockServer:
@@ -59,5 +69,12 @@ def __exit__(self, exc_type, exc_value, traceback):
         self.httpd.shutdown()
         self.thread.join()
 
-    def urljoin(self, url):
+    def urljoin(self, url: str) -> str:
         return urljoin("http://{}:{}".format(self.address, self.port), url)
+
+
+if __name__ == "__main__":
+    with MockServer() as server:
+        print(f"Listening at http://{server.address}:{server.port}")
+        while True:
+            pass
diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import platform
 import subprocess
@@ -57,7 +58,7 @@ async def test_post_request(self):
         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
             with MockServer() as server:
                 req = FormRequest(
-                    server.urljoin("/"), meta={"playwright": True}, formdata={"foo": "bar"}
+                    server.urljoin("/delay/2"), meta={"playwright": True}, formdata={"foo": "bar"}
                 )
                 resp = await handler._download_request(req, Spider("foo"))
 
@@ -124,7 +125,7 @@ async def test_timeout(self):
         }
         async with make_handler(settings_dict) as handler:
             with MockServer() as server:
-                req = Request(server.urljoin("/index.html"), meta={"playwright": True})
+                req = Request(server.urljoin("/delay/2"), meta={"playwright": True})
                 with pytest.raises(TimeoutError):
                     await handler._download_request(req, Spider("foo"))
 
@@ -193,6 +194,36 @@ async def test_page_coroutine_pdf(self):
                 assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result
                 assert get_mimetype(pdf_file) == "application/pdf"
 
+    @pytest.mark.asyncio
+    async def test_user_agent(self):
+        settings_dict = {
+            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
+            "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
+            "USER_AGENT": None,
+        }
+        async with make_handler(settings_dict) as handler:
+            with MockServer() as server:
+                # if Scrapy's user agent is None, use the one from the Browser
+                req = Request(
+                    url=server.urljoin("/headers"),
+                    meta={"playwright": True},
+                )
+                resp = await handler._download_request(req, Spider("foo"))
+                headers = json.loads(resp.css("pre::text").get())
+                headers = {key.lower(): value for key, value in headers.items()}
+                assert headers["user-agent"] == self.browser_type
+
+                # if Scrapy's user agent is set to some value, use it
+                req = Request(
+                    url=server.urljoin("/headers"),
+                    meta={"playwright": True},
+                    headers={"User-Agent": "foobar"},
+                )
+                resp = await handler._download_request(req, Spider("foo"))
+                headers = json.loads(resp.css("pre::text").get())
+                headers = {key.lower(): value for key, value in headers.items()}
+                assert headers["user-agent"] == "foobar"
+
     @pytest.mark.asyncio
     async def test_event_handler_dialog_callable(self):
         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: