Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Connect to remote browser using BrowserType.connect #283

Merged
merged 12 commits into from
Jul 6, 2024
5 changes: 5 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Set up node
uses: actions/setup-node@v4
with:
node-version: 18

- name: Install tox
run: pip install tox

Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,8 @@ coverage.xml
coverage-*.xml
coverage-asyncio/
coverage-twisted/

# nodejs stuff
node_modules/
package-lock.json
package.json
44 changes: 41 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,17 @@ Type `Optional[str]`, default `None`
The endpoint of a remote Chromium browser to connect using the
[Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/),
via [`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp).

```python
PLAYWRIGHT_CDP_URL = "http://localhost:9222"
```

If this setting is used:
* all non-persistent contexts will be created on the connected remote browser
* the `PLAYWRIGHT_LAUNCH_OPTIONS` setting is ignored
* the `PLAYWRIGHT_BROWSER_TYPE` setting must not be set to a value different than "chromium"

```python
PLAYWRIGHT_CDP_URL = "http://localhost:9222"
```
**This setting CANNOT be used at the same time as `PLAYWRIGHT_CONNECT_URL`**

### `PLAYWRIGHT_CDP_KWARGS`
Type `dict[str, Any]`, default `{}`
Expand All @@ -192,6 +195,41 @@ PLAYWRIGHT_CDP_KWARGS = {
}
```

### `PLAYWRIGHT_CONNECT_URL`
Type `Optional[str]`, default `None`

URL of a remote Playwright browser instance to connect using
[`BrowserType.connect`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect).

From the upstream Playwright docs:
> When connecting to another browser launched via
> [`BrowserType.launchServer`](https://playwright.dev/docs/api/class-browsertype#browser-type-launch-server)
> in Node.js, the major and minor version needs to match the client version (1.2.3 → is compatible with 1.2.x).

```python
PLAYWRIGHT_CONNECT_URL = "ws://localhost:35477/ae1fa0bc325adcfd9600d9f712e9c733"
```

If this setting is used:
* all non-persistent contexts will be created on the connected remote browser
* the `PLAYWRIGHT_LAUNCH_OPTIONS` setting is ignored

**This setting CANNOT be used at the same time as `PLAYWRIGHT_CDP_URL`**

### `PLAYWRIGHT_CONNECT_KWARGS`
Type `dict[str, Any]`, default `{}`

Additional keyword arguments to be passed to
[`BrowserType.connect`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect)
when using `PLAYWRIGHT_CONNECT_URL`. The `ws_endpoint` key is always ignored;
`PLAYWRIGHT_CONNECT_URL` is used instead.

```python
PLAYWRIGHT_CONNECT_KWARGS = {
"slow_mo": 1000,
"timeout": 10 * 1000
}
```

### `PLAYWRIGHT_CONTEXTS`
Type `dict[str, dict]`, default `{}`
Expand Down
36 changes: 28 additions & 8 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from scrapy import Spider, signals
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.exceptions import NotSupported
from scrapy.http import Request, Response
from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes
Expand Down Expand Up @@ -69,6 +70,8 @@ class BrowserContextWrapper:
class Config:
cdp_url: Optional[str]
cdp_kwargs: dict
connect_url: Optional[str]
connect_kwargs: dict
browser_type_name: str
launch_options: dict
max_pages_per_context: int
Expand All @@ -78,9 +81,15 @@ class Config:

@classmethod
def from_settings(cls, settings: Settings) -> "Config":
if settings.get("PLAYWRIGHT_CDP_URL") and settings.get("PLAYWRIGHT_CONNECT_URL"):
msg = "Setting both PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL is not supported"
logger.error(msg)
raise NotSupported(msg)
cfg = cls(
cdp_url=settings.get("PLAYWRIGHT_CDP_URL"),
cdp_kwargs=settings.getdict("PLAYWRIGHT_CDP_KWARGS") or {},
connect_url=settings.get("PLAYWRIGHT_CONNECT_URL"),
connect_kwargs=settings.getdict("PLAYWRIGHT_CONNECT_KWARGS") or {},
browser_type_name=settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE,
launch_options=settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {},
max_pages_per_context=settings.getint("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"),
Expand All @@ -91,10 +100,11 @@ def from_settings(cls, settings: Settings) -> "Config":
),
)
cfg.cdp_kwargs.pop("endpoint_url", None)
cfg.connect_kwargs.pop("ws_endpoint", None)
if not cfg.max_pages_per_context:
cfg.max_pages_per_context = settings.getint("CONCURRENT_REQUESTS")
if cfg.cdp_url and cfg.launch_options:
logger.warning("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS")
if (cfg.cdp_url or cfg.connect_url) and cfg.launch_options:
logger.warning("Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS")
return cfg


Expand Down Expand Up @@ -166,7 +176,7 @@ async def _maybe_launch_browser(self) -> None:
self.browser = await self.browser_type.launch(**self.config.launch_options)
logger.info("Browser %s launched", self.browser_type.name)

async def _maybe_connect_devtools(self) -> None:
async def _maybe_connect_remote_devtools(self) -> None:
async with self.browser_launch_lock:
if not hasattr(self, "browser"):
logger.info("Connecting using CDP: %s", self.config.cdp_url)
Expand All @@ -175,6 +185,15 @@ async def _maybe_connect_devtools(self) -> None:
)
logger.info("Connected using CDP: %s", self.config.cdp_url)

async def _maybe_connect_remote(self) -> None:
    """Connect to a remote Playwright browser, unless a browser is already set.

    The check and the connection both happen while holding
    ``self.browser_launch_lock``. The connection is made with
    ``BrowserType.connect`` using ``self.config.connect_url`` and
    ``self.config.connect_kwargs``, and the result is stored in
    ``self.browser``.
    """
    async with self.browser_launch_lock:
        if hasattr(self, "browser"):
            # a browser attribute already exists, nothing to do
            return
        logger.info("Connecting to remote Playwright")
        self.browser = await self.browser_type.connect(
            self.config.connect_url, **self.config.connect_kwargs
        )
        logger.info("Connected to remote Playwright")

async def _create_browser_context(
self,
name: str,
Expand All @@ -187,20 +206,21 @@ async def _create_browser_context(
if hasattr(self, "context_semaphore"):
await self.context_semaphore.acquire()
context_kwargs = context_kwargs or {}
persistent = remote = False
if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY):
context = await self.browser_type.launch_persistent_context(**context_kwargs)
persistent = True
remote = False
elif self.config.cdp_url:
await self._maybe_connect_devtools()
await self._maybe_connect_remote_devtools()
context = await self.browser.new_context(**context_kwargs)
remote = True
elif self.config.connect_url:
await self._maybe_connect_remote()
context = await self.browser.new_context(**context_kwargs)
persistent = False
remote = True
else:
await self._maybe_launch_browser()
context = await self.browser.new_context(**context_kwargs)
persistent = False
remote = False

context.on(
"close", self._make_close_browser_context_callback(name, persistent, remote, spider)
Expand Down
13 changes: 13 additions & 0 deletions tests/launch_browser_server.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Starts a Playwright browser server, used to test the PLAYWRIGHT_CONNECT_URL setting.
// Usage:
//   node launch_browser_server.js PORT WS_PATH

const playwright = require('playwright');

// argv[0] is the node binary and argv[1] is this script; the rest are ours.
const [, , port, wsPath] = process.argv;

async function main() {
  // chromium could be replaced with webkit or firefox
  await playwright.chromium.launchServer({
    host: 'localhost',
    port,
    wsPath,
  });
}

main();
74 changes: 62 additions & 12 deletions tests/tests_asyncio/test_remote.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import asyncio
import logging
import random
import re
import subprocess
import time
import uuid
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Tuple
from unittest import IsolatedAsyncioTestCase

Expand All @@ -14,8 +18,8 @@
from tests.mockserver import StaticMockServer


async def _run_chromium() -> Tuple[subprocess.Popen, str]:
"""Run a Croumium instance in a separate process, return the process
async def _run_chromium_devtools() -> Tuple[subprocess.Popen, str]:
"""Run a Chromium instance in a separate process, return the process
object and a string with its devtools endpoint.
"""
async with async_playwright() as playwright:
Expand All @@ -38,32 +42,50 @@ async def _run_chromium() -> Tuple[subprocess.Popen, str]:
return proc, devtools_url


def _run_playwright_browser_server() -> Tuple[subprocess.Popen, str]:
    """Start a Playwright browser server in a separate process.

    Returns the process object and a string with the websocket endpoint
    built from the port and path passed to the server script.

    The port and websocket path are chosen here and passed to the node
    script as arguments, instead of letting Playwright choose them, because
    capturing the script's stdout/stderr to read them back did not work.
    """
    script = Path(__file__).parent.parent / "launch_browser_server.js"
    port = random.randint(60_000, 63_000)
    ws_path = uuid.uuid4()
    endpoint = f"ws://localhost:{port}/{ws_path}"
    # not used as a context manager: the process is returned to the caller
    proc = subprocess.Popen(  # pylint: disable=consider-using-with
        ["node", str(script), str(port), str(ws_path)]
    )
    return proc, endpoint


@asynccontextmanager
async def remote_chromium():
"""Launch a Chromium instance with remote debugging enabled."""
proc = None
devtools_url = None
async def remote_browser(is_chrome_devtools_protocol: bool = True):
"""Launch a remote browser that lasts while in the context."""
proc = url = None
try:
proc, devtools_url = await _run_chromium()
if is_chrome_devtools_protocol:
proc, url = await _run_chromium_devtools()
else:
proc, url = _run_playwright_browser_server()
await asyncio.sleep(1) # allow some time for the browser to start
except Exception:
pass
else:
yield devtools_url
print(f"Browser URL: {url}")
yield url
finally:
if proc:
proc.kill()
proc.communicate()


class TestRemoteDevtools(IsolatedAsyncioTestCase):
class TestRemote(IsolatedAsyncioTestCase):
@pytest.fixture(autouse=True)
def inject_fixtures(self, caplog):
caplog.set_level(logging.DEBUG)
self._caplog = caplog

@allow_windows
async def test_devtools(self):
async with remote_chromium() as devtools_url:
async def test_connect_devtools(self):
async with remote_browser(is_chrome_devtools_protocol=True) as devtools_url:
settings_dict = {
"PLAYWRIGHT_CDP_URL": devtools_url,
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
Expand All @@ -76,5 +98,33 @@ async def test_devtools(self):
assert (
"scrapy-playwright",
logging.WARNING,
"PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
"Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
) in self._caplog.record_tuples

@allow_windows
async def test_connect(self):
    """Download a page through a browser reached via PLAYWRIGHT_CONNECT_URL.

    Also checks the log records emitted while connecting, including the
    warning that PLAYWRIGHT_LAUNCH_OPTIONS is ignored.
    """
    expected_records = [
        ("scrapy-playwright", logging.INFO, "Connecting to remote Playwright"),
        ("scrapy-playwright", logging.INFO, "Connected to remote Playwright"),
        (
            "scrapy-playwright",
            logging.WARNING,
            "Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
        ),
    ]
    async with remote_browser(is_chrome_devtools_protocol=False) as browser_url:
        settings_dict = {
            "PLAYWRIGHT_CONNECT_URL": browser_url,
            "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
        }
        async with make_handler(settings_dict) as handler:
            with StaticMockServer() as server:
                request = Request(server.urljoin("/index.html"), meta={"playwright": True})
                response = await handler._download_request(request, Spider("foo"))
            assert_correct_response(response, request)
            for record in expected_records:
                assert record in self._caplog.record_tuples
12 changes: 12 additions & 0 deletions tests/tests_asyncio/test_settings.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from unittest import IsolatedAsyncioTestCase

import pytest
from scrapy.exceptions import NotSupported
from scrapy.settings import Settings

from scrapy_playwright.handler import Config
Expand Down Expand Up @@ -31,6 +33,16 @@ async def test_max_pages_per_context(self):
config = Config.from_settings(Settings({"CONCURRENT_REQUESTS": 9876}))
assert config.max_pages_per_context == 9876

async def test_connect_remote_urls(self):
    """Building a Config must fail when both remote URL settings are set."""
    conflicting_settings = Settings(
        {"PLAYWRIGHT_CONNECT_URL": "asdf", "PLAYWRIGHT_CDP_URL": "qwerty"}
    )
    expected_message = (
        "Setting both PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL is not supported"
    )
    with pytest.raises(NotSupported) as exc_info:
        Config.from_settings(conflicting_settings)
    assert str(exc_info.value) == expected_message

@allow_windows
async def test_max_contexts(self):
async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": None}) as handler:
Expand Down
6 changes: 6 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,14 @@ deps =
pytest_cov==4.1.0
pytest_twisted==1.14
psutil==5.9.7
playwright==1.44 # version must match the one installed with npm below
allowlist_externals =
npm
npx
commands =
playwright install
npm install playwright@1.44
npx playwright install chromium
py.test -vv --reactor=asyncio \
--cov-report=term-missing \
--cov-report=xml:coverage-asyncio.xml \
Expand Down