Skip to content

Commit

Permalink
Override User-Agent header (#29)
Browse files Browse the repository at this point in the history
* Use the browser's user agent if Scrapy's is None

* Add note about the user agent to the readme

* Content-Type header in tests
  • Loading branch information
elacuesta authored Oct 18, 2021
1 parent 5bcac0f commit 9981cd8
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 7 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,14 @@ class AwesomeSpider(scrapy.Spider):
yield {"url": response.url}
```

### Notes about the User-Agent header

By default, outgoing requests include the `User-Agent` set by Scrapy (either with the
`USER_AGENT` or `DEFAULT_REQUEST_HEADERS` settings or via the `Request.headers` attribute).
This could cause some sites to react in unexpected ways, for instance if the user agent
does not match the browser being used. If you prefer to send the browser's own `User-Agent`
instead, set the Scrapy user agent to `None`.


## Receiving the Page object in the callback

Expand Down
3 changes: 2 additions & 1 deletion scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,12 +217,13 @@ def _make_request_handler(
) -> Callable:
def request_handler(route: Route, pw_request: PlaywrightRequest) -> None:
"""Override request headers, method and body."""
headers.setdefault("user-agent", pw_request.headers.get("user-agent"))
if pw_request.url == url:
overrides: dict = {"method": method, "headers": headers}
if body is not None:
overrides["post_data"] = body.decode(encoding)
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
if self.browser_type == "firefox":
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
overrides["headers"]["host"] = urlparse(pw_request.url).netloc
else:
overrides = {"headers": pw_request.headers.copy()}
Expand Down
25 changes: 21 additions & 4 deletions tests/mockserver.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import re
import sys
import time
Expand Down Expand Up @@ -40,11 +41,20 @@ def do_POST(self):
self.wfile.write(body)

def do_GET(self):
"""Take a long time to reply"""
time.sleep(2)
body = "{}"
if self.path == "/headers":
body = json.dumps(dict(self.headers), indent=4)
else:
delay_match = re.match(r"^/delay/(\d+)$", self.path)
if delay_match:
delay = int(delay_match.group(1))
print(f"Sleeping {delay} seconds...")
time.sleep(delay)
body = json.dumps({"delay": delay})
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(b"Hello world!")
self.wfile.write(body.encode())


class MockServer:
Expand All @@ -59,5 +69,12 @@ def __exit__(self, exc_type, exc_value, traceback):
self.httpd.shutdown()
self.thread.join()

def urljoin(self, url):
def urljoin(self, url: str) -> str:
return urljoin("http://{}:{}".format(self.address, self.port), url)


if __name__ == "__main__":
    # Manual entry point: start the mock server and keep the process alive so
    # it can be queried by hand (e.g. from a browser) until interrupted.
    with MockServer() as server:
        print(f"Listening at http://{server.address}:{server.port}")
        while True:
            # Sleep rather than spin: a bare `pass` loop keeps one CPU core
            # fully busy for as long as the server is left running.
            time.sleep(1)
35 changes: 33 additions & 2 deletions tests/test_playwright_requests.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import logging
import platform
import subprocess
Expand Down Expand Up @@ -57,7 +58,7 @@ async def test_post_request(self):
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
with MockServer() as server:
req = FormRequest(
server.urljoin("/"), meta={"playwright": True}, formdata={"foo": "bar"}
server.urljoin("/delay/2"), meta={"playwright": True}, formdata={"foo": "bar"}
)
resp = await handler._download_request(req, Spider("foo"))

Expand Down Expand Up @@ -124,7 +125,7 @@ async def test_timeout(self):
}
async with make_handler(settings_dict) as handler:
with MockServer() as server:
req = Request(server.urljoin("/index.html"), meta={"playwright": True})
req = Request(server.urljoin("/delay/2"), meta={"playwright": True})
with pytest.raises(TimeoutError):
await handler._download_request(req, Spider("foo"))

Expand Down Expand Up @@ -193,6 +194,36 @@ async def test_page_coroutine_pdf(self):
assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result
assert get_mimetype(pdf_file) == "application/pdf"

@pytest.mark.asyncio
async def test_user_agent(self):
    """Check which User-Agent header reaches the mock server.

    The browser context is configured with ``self.browser_type`` as its user
    agent and Scrapy's ``USER_AGENT`` setting is ``None``. Two requests are
    sent to the mock server's ``/headers`` endpoint:

    * one without an explicit ``User-Agent`` header, which must arrive with
      the browser context's user agent;
    * one with ``User-Agent: foobar``, which must arrive with that value.
    """
    context_user_agent = self.browser_type
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": context_user_agent}},
        "USER_AGENT": None,
    }

    async def received_headers(handler, request):
        # Download the request and parse the JSON found in the response's
        # <pre> element; keys are lower-cased so lookups are case-insensitive.
        response = await handler._download_request(request, Spider("foo"))
        parsed = json.loads(response.css("pre::text").get())
        return {name.lower(): value for name, value in parsed.items()}

    async with make_handler(settings_dict) as handler:
        with MockServer() as server:
            # if Scrapy's user agent is None, use the one from the Browser
            without_scrapy_ua = Request(
                url=server.urljoin("/headers"),
                meta={"playwright": True},
            )
            seen = await received_headers(handler, without_scrapy_ua)
            assert seen["user-agent"] == context_user_agent

            # if Scrapy's user agent is set to some value, use it
            with_scrapy_ua = Request(
                url=server.urljoin("/headers"),
                meta={"playwright": True},
                headers={"User-Agent": "foobar"},
            )
            seen = await received_headers(handler, with_scrapy_ua)
            assert seen["user-agent"] == "foobar"

@pytest.mark.asyncio
async def test_event_handler_dialog_callable(self):
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
Expand Down

0 comments on commit 9981cd8

Please sign in to comment.