From b350ef94cc57122bf952ccc94d31f273cf5fa8f6 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 1 Jul 2024 01:43:06 -0300 Subject: [PATCH 01/11] Connect to remote browser with BrowserType.connect --- scrapy_playwright/handler.py | 28 ++++++++++++++++++++++++++-- tests/tests_asyncio/test_remote.py | 2 +- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 9096cbb9..ee5b5d44 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -22,6 +22,7 @@ from scrapy import Spider, signals from scrapy.core.downloader.handlers.http import HTTPDownloadHandler from scrapy.crawler import Crawler +from scrapy.exceptions import NotSupported from scrapy.http import Request, Response from scrapy.http.headers import Headers from scrapy.responsetypes import responsetypes @@ -67,6 +68,8 @@ class BrowserContextWrapper: class Config: cdp_url: Optional[str] cdp_kwargs: dict + connect_url: Optional[str] + connect_kwargs: dict browser_type_name: str launch_options: dict max_pages_per_context: int @@ -76,9 +79,15 @@ class Config: @classmethod def from_settings(cls, settings: Settings) -> "Config": + if settings.get("PLAYWRIGHT_CDP_URL") and settings.get("PLAYWRIGHT_CONNECT_URL"): + msg = "Setting both PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL is not supported" + logger.error(msg) + raise NotSupported(msg) cfg = cls( cdp_url=settings.get("PLAYWRIGHT_CDP_URL"), cdp_kwargs=settings.getdict("PLAYWRIGHT_CDP_KWARGS") or {}, + connect_url=settings.get("PLAYWRIGHT_CONNECT_URL"), + connect_kwargs=settings.getdict("PLAYWRIGHT_CONNECT_KWARGS") or {}, browser_type_name=settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE, launch_options=settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}, max_pages_per_context=settings.getint("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"), @@ -86,10 +95,11 @@ def from_settings(cls, settings: Settings) -> "Config": startup_context_kwargs=settings.getdict("PLAYWRIGHT_CONTEXTS"), ) cfg.cdp_kwargs.pop("endpoint_url", None) + cfg.connect_kwargs.pop("ws_endpoint", None) if not cfg.max_pages_per_context: cfg.max_pages_per_context = settings.getint("CONCURRENT_REQUESTS") - if cfg.cdp_url and cfg.launch_options: - logger.warning("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS") + if (cfg.cdp_url or cfg.connect_url) and cfg.launch_options: + logger.warning("Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS") if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings: with suppress(TypeError, ValueError): cfg.navigation_timeout = float(settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"]) @@ -172,6 +182,15 @@ async def _maybe_connect_devtools(self) -> None: ) logger.info("Connected using CDP: %s", self.config.cdp_url) + async def _maybe_connect_remote(self) -> None: + async with self.browser_launch_lock: + if not hasattr(self, "browser"): + logger.info("Connecting to remote Playwright: %s", self.config.connect_url) + self.browser = await self.browser_type.connect( + self.config.connect_url, **self.config.connect_kwargs + ) + logger.info("Connected to remote Playwright: %s", self.config.connect_kwargs) + async def _create_browser_context( self, name: str, @@ -193,6 +212,11 @@ async def _create_browser_context( context = await self.browser.new_context(**context_kwargs) persistent = False remote = True + elif self.config.connect_url: + await self._maybe_connect_remote() + context = await self.browser.new_context(**context_kwargs) + persistent = False + remote = True else: await self._maybe_launch_browser() context = await self.browser.new_context(**context_kwargs) diff --git a/tests/tests_asyncio/test_remote.py b/tests/tests_asyncio/test_remote.py index 174c9090..586d2f44 100644 --- a/tests/tests_asyncio/test_remote.py +++ b/tests/tests_asyncio/test_remote.py @@ -76,5 +76,5 @@ async def test_devtools(self): assert ( "scrapy-playwright", logging.WARNING, - "PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS", + "Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS", ) in self._caplog.record_tuples From 20d32e40e98ab5122fe9041d374cb58f1058e771 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 1 Jul 2024 01:44:42 -0300 Subject: [PATCH 02/11] Docs for PLAYWRIGHT_CONNECT_URL & PLAYWRIGHT_CONNECT_KWARGS --- README.md | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1ffcfa5c..19b2a919 100644 --- a/README.md +++ b/README.md @@ -168,14 +168,17 @@ Type `Optional[str]`, default `None` The endpoint of a remote Chromium browser to connect using the [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/), via [`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp). + +```python +PLAYWRIGHT_CDP_URL = "http://localhost:9222" +``` + If this setting is used: * all non-persistent contexts will be created on the connected remote browser * the `PLAYWRIGHT_LAUNCH_OPTIONS` setting is ignored * the `PLAYWRIGHT_BROWSER_TYPE` setting must not be set to a value different than "chromium" -```python -PLAYWRIGHT_CDP_URL = "http://localhost:9222" -``` +**This settings CANNOT be used at the same time as `PLAYWRIGHT_CONNECT_URL`** ### `PLAYWRIGHT_CDP_KWARGS` Type `dict[str, Any]`, default `{}` @@ -192,6 +195,41 @@ PLAYWRIGHT_CDP_KWARGS = { } ``` +### `PLAYWRIGHT_CONNECT_URL` +Type `Optional[str]`, default `None` + +URL of a remote Playwright browser instance to connect using +[`BrowserType.connect`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect). + +From the upstream Playwright docs: +> When connecting to another browser launched via +> [`BrowserType.launchServer`](https://playwright.dev/docs/api/class-browsertype#browser-type-launch-server) +> in Node.js, the major and minor version needs to match the client version (1.2.3 → is compatible with 1.2.x). + +```python +PLAYWRIGHT_CONNECT_URL = "ws://localhost:35477/ae1fa0bc325adcfd9600d9f712e9c733" +``` + +If this setting is used: +* all non-persistent contexts will be created on the connected remote browser +* the `PLAYWRIGHT_LAUNCH_OPTIONS` setting is ignored + +**This settings CANNOT be used at the same time as `PLAYWRIGHT_CDP_URL`** + +### `PLAYWRIGHT_CONNECT_KWARGS` +Type `dict[str, Any]`, default `{}` + +Additional keyword arguments to be passed to +[`BrowserType.connect`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect) +when using `PLAYWRIGHT_CONNECT_URL`. The `ws_endpoint` key is always ignored, +`PLAYWRIGHT_CONNECT_URL` is used instead. + +```python +PLAYWRIGHT_CONNECT_KWARGS = { + "slow_mo": 1000, + "timeout": 10 * 1000 +} +``` ### `PLAYWRIGHT_CONTEXTS` Type `dict[str, dict]`, default `{}` From 65b7593ad09b3d7907f19ee9247315d2767704c0 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 1 Jul 2024 01:59:17 -0300 Subject: [PATCH 03/11] Test error when setting PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL --- tests/tests_asyncio/test_settings.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/tests_asyncio/test_settings.py b/tests/tests_asyncio/test_settings.py index 3dbabc65..84c13038 100644 --- a/tests/tests_asyncio/test_settings.py +++ b/tests/tests_asyncio/test_settings.py @@ -1,5 +1,7 @@ from unittest import IsolatedAsyncioTestCase +import pytest +from scrapy.exceptions import NotSupported from scrapy.settings import Settings from scrapy_playwright.handler import Config @@ -31,6 +33,16 @@ async def test_max_pages_per_context(self): config = Config.from_settings(Settings({"CONCURRENT_REQUESTS": 9876})) assert config.max_pages_per_context == 9876 + async def test_connect_remote_urls(self): + with pytest.raises(NotSupported) as exc_info: + Config.from_settings( + Settings({"PLAYWRIGHT_CONNECT_URL": "asdf", "PLAYWRIGHT_CDP_URL": "qwerty"}) + ) + assert ( + str(exc_info.value) + == "Setting both PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL is not supported" + ) + @allow_windows async def test_max_contexts(self): async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": None}) as handler: From b84921767ddd253e0f26254ca84b336d8b943352 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 1 Jul 2024 02:11:38 -0300 Subject: [PATCH 04/11] Rename method, simplify variables --- scrapy_playwright/handler.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index ee5b5d44..6cd2e04d 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -173,7 +173,7 @@ async def _maybe_launch_browser(self) -> None: self.browser = await self.browser_type.launch(**self.config.launch_options) logger.info("Browser %s launched", self.browser_type.name) - async def _maybe_connect_devtools(self) -> None: + async def _maybe_connect_remote_devtools(self) -> None: async with self.browser_launch_lock: if not hasattr(self, "browser"): logger.info("Connecting using CDP: %s", self.config.cdp_url) @@ -203,25 +203,21 @@ async def _create_browser_context( if hasattr(self, "context_semaphore"): await self.context_semaphore.acquire() context_kwargs = context_kwargs or {} + persistent = remote = False if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY): context = await self.browser_type.launch_persistent_context(**context_kwargs) persistent = True - remote = False elif self.config.cdp_url: - await self._maybe_connect_devtools() + await self._maybe_connect_remote_devtools() context = await self.browser.new_context(**context_kwargs) - persistent = False remote = True elif self.config.connect_url: await self._maybe_connect_remote() context = await self.browser.new_context(**context_kwargs) - persistent = False remote = True else: await self._maybe_launch_browser() context = await self.browser.new_context(**context_kwargs) - persistent = False - remote = False context.on( "close", self._make_close_browser_context_callback(name, persistent, remote, spider) From 9f05b0663e313ad372bc1901a9fc5a82d6a9c9e8 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 2 Jul 2024 09:55:55 -0300 Subject: [PATCH 05/11] Test PLAYWRIGHT_CONNECT_URL --- scrapy_playwright/handler.py | 4 +- tests/launch_browser_server.js | 9 ++++ tests/tests_asyncio/test_remote.py | 71 +++++++++++++++++++++++++----- 3 files changed, 71 insertions(+), 13 deletions(-) create mode 100644 tests/launch_browser_server.js diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 6cd2e04d..4d1f44e8 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -185,11 +185,11 @@ async def _maybe_connect_remote_devtools(self) -> None: async def _maybe_connect_remote(self) -> None: async with self.browser_launch_lock: if not hasattr(self, "browser"): - logger.info("Connecting to remote Playwright: %s", self.config.connect_url) + logger.info("Connecting to remote Playwright") self.browser = await self.browser_type.connect( self.config.connect_url, **self.config.connect_kwargs ) - logger.info("Connected to remote Playwright: %s", self.config.connect_kwargs) + logger.info("Connected to remote Playwright") async def _create_browser_context( self, diff --git a/tests/launch_browser_server.js b/tests/launch_browser_server.js new file mode 100644 index 00000000..bdfc4ce7 --- /dev/null +++ b/tests/launch_browser_server.js @@ -0,0 +1,9 @@ +const { chromium } = require('playwright'); // Or 'webkit' or 'firefox'. + +(async () => { + const browserServer = await chromium.launchServer({ + host: 'localhost', + port: process.argv[2], + wsPath: process.argv[3] + }); +})(); diff --git a/tests/tests_asyncio/test_remote.py b/tests/tests_asyncio/test_remote.py index 586d2f44..6adebe52 100644 --- a/tests/tests_asyncio/test_remote.py +++ b/tests/tests_asyncio/test_remote.py @@ -1,8 +1,12 @@ +import asyncio import logging +import random import re import subprocess import time +import uuid from contextlib import asynccontextmanager +from pathlib import Path from typing import Tuple from unittest import IsolatedAsyncioTestCase @@ -14,8 +18,8 @@ from tests.mockserver import StaticMockServer -async def _run_chromium() -> Tuple[subprocess.Popen, str]: - """Run a Croumium instance in a separate process, return the process +async def _run_chromium_devtools() -> Tuple[subprocess.Popen, str]: + """Run a Chromium instance in a separate process, return the process object and a string with its devtools endpoint. """ async with async_playwright() as playwright: @@ -38,32 +42,50 @@ async def _run_chromium() -> Tuple[subprocess.Popen, str]: return proc, devtools_url +def _run_playwright_browser_server() -> Tuple[subprocess.Popen, str]: + """Start a Playwright server in a separate process, return the process + object and a string with its websocket endpoint. + Pass fixed port and ws path via env variables instead of allowing Playwright + to choose, for some reason I was unable to capture stdout/stderr :shrug: + """ + port = str(random.randint(60_000, 63_000)) + ws_path = str(uuid.uuid4()) + launch_server_script_path = str(Path(__file__).parent.parent / "launch_browser_server.js") + command = ["node", launch_server_script_path, port, ws_path] + proc = subprocess.Popen(command) # pylint: disable=consider-using-with + return proc, f"ws:/localhost:{port}/{ws_path}" + + @asynccontextmanager -async def remote_chromium(): - """Launch a Chromium instance with remote debugging enabled.""" - proc = None - devtools_url = None +async def remote_browser(is_chrome_devtools_protocol: bool = True): + """Launch a remote browser that lasts while in the context.""" + proc = url = None try: - proc, devtools_url = await _run_chromium() + if is_chrome_devtools_protocol: + proc, url = await _run_chromium_devtools() + else: + proc, url = _run_playwright_browser_server() + await asyncio.sleep(1) # allow some time for the browser to start except Exception: pass else: - yield devtools_url + print(f"Browser URL: {url}") + yield url finally: if proc: proc.kill() proc.communicate() -class TestRemoteDevtools(IsolatedAsyncioTestCase): +class TestRemote(IsolatedAsyncioTestCase): @pytest.fixture(autouse=True) def inject_fixtures(self, caplog): caplog.set_level(logging.DEBUG) self._caplog = caplog @allow_windows - async def test_devtools(self): - async with remote_chromium() as devtools_url: + async def test_connect_devtools(self): + async with remote_browser(is_chrome_devtools_protocol=True) as devtools_url: settings_dict = { "PLAYWRIGHT_CDP_URL": devtools_url, "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True}, @@ -78,3 +100,30 @@ async def test_devtools(self): logging.WARNING, "Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS", ) in self._caplog.record_tuples + + async def test_connect(self): + async with remote_browser(is_chrome_devtools_protocol=False) as browser_url: + settings_dict = { + "PLAYWRIGHT_CONNECT_URL": browser_url, + "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True}, + } + async with make_handler(settings_dict) as handler: + with StaticMockServer() as server: + req = Request(server.urljoin("/index.html"), meta={"playwright": True}) + resp = await handler._download_request(req, Spider("foo")) + assert_correct_response(resp, req) + assert ( + "scrapy-playwright", + logging.INFO, + "Connecting to remote Playwright", + ) in self._caplog.record_tuples + assert ( + "scrapy-playwright", + logging.INFO, + "Connected to remote Playwright", + ) in self._caplog.record_tuples + assert ( + "scrapy-playwright", + logging.WARNING, + "Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS", + ) in self._caplog.record_tuples From 8d899bebdc33160ea23cf7fb8d9c63ee12e97a85 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 2 Jul 2024 10:07:01 -0300 Subject: [PATCH 06/11] Setup node for tests --- .github/workflows/tests.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e6fcca31..049a7382 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,6 +24,16 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Set up node + uses: actions/setup-node@v4 + with: + node-version: 18 + + - name: Install Playwright & Chromium Browser + run: | + npm install playwright@latest + npx playwright install chromium --with-deps + - name: Install tox run: pip install tox From 7c4cb0ee4437a44f807c7f5e77bb285ebd9b71da Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 2 Jul 2024 10:18:47 -0300 Subject: [PATCH 07/11] Test connect on Windows --- tests/tests_asyncio/test_remote.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_asyncio/test_remote.py b/tests/tests_asyncio/test_remote.py index 6adebe52..14eb1f0d 100644 --- a/tests/tests_asyncio/test_remote.py +++ b/tests/tests_asyncio/test_remote.py @@ -101,6 +101,7 @@ async def test_connect_devtools(self): "Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS", ) in self._caplog.record_tuples + @allow_windows async def test_connect(self): async with remote_browser(is_chrome_devtools_protocol=False) as browser_url: settings_dict = { From 3d06e44dbc1d0fa4e9697b4f4b0da3335ff64244 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 2 Jul 2024 10:24:23 -0300 Subject: [PATCH 08/11] Match python & nodejs playwright versions --- .github/workflows/tests.yml | 3 ++- tox.ini | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 049a7382..742ff64a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -29,9 +29,10 @@ jobs: with: node-version: 18 + # version must match the one pinned in tox.ini - name: Install Playwright & Chromium Browser run: | - npm install playwright@latest + npm install playwright@1.44 npx playwright install chromium --with-deps - name: Install tox diff --git a/tox.ini b/tox.ini index fbe9ceca..8910f6e9 100644 --- a/tox.ini +++ b/tox.ini @@ -7,6 +7,7 @@ deps = pytest_cov==4.1.0 pytest_twisted==1.14 psutil==5.9.7 + playwright==1.44 # version must match the nodejs version in the CI config commands = playwright install py.test -vv --reactor=asyncio \ From daabb4be9c549f7ced389d9d1662f7ee735bfd92 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 2 Jul 2024 10:40:52 -0300 Subject: [PATCH 09/11] Adjustments --- tests/launch_browser_server.js | 4 ++++ tests/tests_asyncio/test_remote.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/launch_browser_server.js b/tests/launch_browser_server.js index bdfc4ce7..92447f00 100644 --- a/tests/launch_browser_server.js +++ b/tests/launch_browser_server.js @@ -1,3 +1,7 @@ +// used to start a browser server to test the PLAYWRIGHT_CONNECT_URL setting +// usage: +// node launch_browser_server.js PORT WS_PATH + const { chromium } = require('playwright'); // Or 'webkit' or 'firefox'. (async () => { diff --git a/tests/tests_asyncio/test_remote.py b/tests/tests_asyncio/test_remote.py index 14eb1f0d..84b2c89d 100644 --- a/tests/tests_asyncio/test_remote.py +++ b/tests/tests_asyncio/test_remote.py @@ -45,7 +45,7 @@ async def _run_chromium_devtools() -> Tuple[subprocess.Popen, str]: def _run_playwright_browser_server() -> Tuple[subprocess.Popen, str]: """Start a Playwright server in a separate process, return the process object and a string with its websocket endpoint. - Pass fixed port and ws path via env variables instead of allowing Playwright + Pass fixed port and ws path as arguments instead of allowing Playwright to choose, for some reason I was unable to capture stdout/stderr :shrug: """ port = str(random.randint(60_000, 63_000)) From b738a1a4d2bc40e12d02d9e9644428cfb73cac40 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 3 Jul 2024 20:47:55 -0300 Subject: [PATCH 10/11] Update tox.ini --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 1137046f..6877b8a1 100644 --- a/tox.ini +++ b/tox.ini @@ -8,7 +8,7 @@ deps = pytest_twisted==1.14 psutil==5.9.7 playwright==1.44 # version must match the one installed with npm below -whitelist_externals = +allowlist_externals = npm npx commands = From 2fc8fba378cbe0165cdd57a226bf657e79e0c1d2 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 3 Jul 2024 22:23:54 -0300 Subject: [PATCH 11/11] Double slash --- tests/tests_asyncio/test_remote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_asyncio/test_remote.py b/tests/tests_asyncio/test_remote.py index 84b2c89d..69502c0d 100644 --- a/tests/tests_asyncio/test_remote.py +++ b/tests/tests_asyncio/test_remote.py @@ -53,7 +53,7 @@ def _run_playwright_browser_server() -> Tuple[subprocess.Popen, str]: launch_server_script_path = str(Path(__file__).parent.parent / "launch_browser_server.js") command = ["node", launch_server_script_path, port, ws_path] proc = subprocess.Popen(command) # pylint: disable=consider-using-with - return proc, f"ws:/localhost:{port}/{ws_path}" + return proc, f"ws://localhost:{port}/{ws_path}" @asynccontextmanager