diff --git a/gptme/tools/_browser_lynx.py b/gptme/tools/_browser_lynx.py
new file mode 100644
index 00000000..ad470ae3
--- /dev/null
+++ b/gptme/tools/_browser_lynx.py
@@ -0,0 +1,29 @@
+"""
+Browser tool that reads pages by calling lynx --dump.
+"""
+
+import subprocess
+
+
+def read_url(url):
+    return subprocess.run(
+        ["lynx", "--dump", url, "--display_charset=utf-8"], stdout=subprocess.PIPE
+    ).stdout.decode("utf-8")
+
+
+def search(query, engine="google"):
+    if engine == "google":
+        return read_url(f"https://www.google.com/search?q={query}")
+    elif engine == "duckduckgo":
+        return read_url(f"https://duckduckgo.com/?q={query}")
+    raise ValueError(f"Unknown search engine: {engine}")
+
+
+def test_read_url():
+    print(read_url("https://gptme.org/"))
+    print(read_url("https://github.com/ErikBjare/gptme/issues/205"))
+
+
+def test_search():
+    print(search("Python", "google"))
+    print(search("Python", "duckduckgo"))
diff --git a/gptme/tools/_browser_playwright.py b/gptme/tools/_browser_playwright.py
index 2c40ae00..a3827765 100644
--- a/gptme/tools/_browser_playwright.py
+++ b/gptme/tools/_browser_playwright.py
@@ -1,7 +1,13 @@
import atexit
import logging
+import os
+import re
+import shutil
+import subprocess
+import tempfile
import urllib.parse
from dataclasses import dataclass
+from pathlib import Path
from playwright.sync_api import (
    ElementHandle,
@@ -48,6 +54,19 @@ def load_page(url: str) -> Page:
    return page
+def read_url(url: str) -> str:
+    """Read the text of a webpage and return the text in Markdown format."""
+    page = load_page(url)
+
+    # Get the HTML of the body
+    body_html = page.inner_html("body")
+
+    # Convert the HTML to Markdown
+    markdown = html_to_markdown(body_html)
+
+    return markdown
+
+
def search_google(query: str) -> str:
    query = urllib.parse.quote(query)
    url = f"https://www.google.com/search?q={query}&hl=en"
@@ -169,3 +188,64 @@ def _list_results_duckduckgo(page) -> str:
        desc = result.query_selector("span").inner_text().strip().split("\n")[0]
        hits.append(SearchResult(title, url, desc))
    return titleurl_to_list(hits)
+
+
+def screenshot_url(url: str, path: Path | str | None = None) -> Path:
+ """Take a screenshot of a webpage and save it to a file."""
+ logger.info(f"Taking screenshot of '{url}' and saving to '{path}'")
+ page = load_page(url)
+
+ if path is None:
+ path = tempfile.mktemp(suffix=".png")
+ else:
+ # create the directory if it doesn't exist
+ os.makedirs(os.path.dirname(path), exist_ok=True)
+
+ # Take the screenshot
+ page.screenshot(path=path)
+
+ print(f"Screenshot saved to {path}")
+ return Path(path)
+
+
+def html_to_markdown(html):
+ # check that pandoc is installed
+ if not shutil.which("pandoc"):
+ raise Exception("Pandoc is not installed. Needed for browsing.")
+
+ p = subprocess.Popen(
+ ["pandoc", "-f", "html", "-t", "markdown"],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ stdout, stderr = p.communicate(input=html.encode())
+
+ if p.returncode != 0:
+ raise Exception(f"Pandoc returned error code {p.returncode}: {stderr.decode()}")
+
+ # Post-process the output to remove :::
+ markdown = stdout.decode()
+ markdown = "\n".join(
+ line for line in markdown.split("\n") if not line.strip().startswith(":::")
+ )
+
+ # Post-process the output to remove div tags
+    markdown = markdown.replace("<div>", "").replace("</div>", "")
+
+    # replace [\n]{3,} with \n\n
+    markdown = re.sub(r"[\n]{3,}", "\n\n", markdown)
+
+    # replace {...} with ''
+    markdown = re.sub(r"\{(#|style|target|\.)[^}]*\}", "", markdown)
+
+    # strip inline images, like: data:image/png;base64,...
+    re_strip_data = re.compile(r"!\[[^\]]*\]\(data:image[^)]*\)")
+
+    # test cases
+    assert re_strip_data.sub("", "![test](data:image/png;base64,123)") == ""
+    assert re_strip_data.sub("", "![test](data:image/png;base64,123) test") == " test"
+
+    markdown = re_strip_data.sub("", markdown)
+
+    return markdown
diff --git a/gptme/tools/browser.py b/gptme/tools/browser.py
index 182efd76..fa66c1ab 100644
--- a/gptme/tools/browser.py
+++ b/gptme/tools/browser.py
@@ -19,26 +19,28 @@
import importlib.util
import logging
-import os
-import re
import shutil
-import subprocess
-import tempfile
from pathlib import Path
from typing import Literal
from .base import ToolSpec, ToolUse
-has_playwright = importlib.util.find_spec("playwright") is not None
+has_playwright = lambda: importlib.util.find_spec("playwright") is not None # noqa
+has_lynx = lambda: shutil.which("lynx") # noqa
+browser: Literal["playwright", "lynx"] | None = (
+ "playwright" if has_playwright() else ("lynx" if has_lynx() else None)
+)
# noreorder
-if has_playwright:
-    from ._browser_playwright import ( # fmt: skip
-        load_page,
-        search_duckduckgo,
-        search_google,
+if browser == "playwright":
+    from ._browser_playwright import read_url as read_url_playwright # fmt: skip
+    from ._browser_playwright import (
+        screenshot_url as screenshot_url_playwright, # fmt: skip
    )
-
+    from ._browser_playwright import search_duckduckgo, search_google # fmt: skip
+elif browser == "lynx":
+    from ._browser_lynx import read_url as read_url_lynx # fmt: skip
+    from ._browser_lynx import search as search_lynx # fmt: skip
logger = logging.getLogger(__name__)
@@ -83,92 +85,43 @@
def has_browser_tool():
-    return has_playwright
+    return browser is not None
def read_url(url: str) -> str:
- """Read the text of a webpage and return the text in Markdown format."""
- page = load_page(url)
-
- # Get the HTML of the body
- body_html = page.inner_html("body")
-
- # Convert the HTML to Markdown
- markdown = html_to_markdown(body_html)
-
- return markdown
+ """Read a webpage in a text format."""
+ assert browser
+ if browser == "playwright":
+ return read_url_playwright(url) # type: ignore
+ elif browser == "lynx":
+ return read_url_lynx(url) # type: ignore
def search(query: str, engine: EngineType = "google") -> str:
"""Search for a query on a search engine."""
logger.info(f"Searching for '{query}' on {engine}")
+ if browser == "playwright":
+ return search_playwright(query, engine)
+ elif browser == "lynx":
+ return search_lynx(query, engine) # type: ignore
+ raise ValueError(f"Unknown search engine: {engine}")
+
+
+def search_playwright(query: str, engine: EngineType = "google") -> str:
+ """Search for a query on a search engine using Playwright."""
if engine == "google":
- return search_google(query)
+ return search_google(query) # type: ignore
elif engine == "duckduckgo":
- return search_duckduckgo(query)
- else:
- raise ValueError(f"Unknown search engine: {engine}")
+ return search_duckduckgo(query) # type: ignore
+ raise ValueError(f"Unknown search engine: {engine}")
def screenshot_url(url: str, path: Path | str | None = None) -> Path:
- """Take a screenshot of a webpage and save it to a file."""
- logger.info(f"Taking screenshot of '{url}' and saving to '{path}'")
- page = load_page(url)
-
- if path is None:
- path = tempfile.mktemp(suffix=".png")
- else:
- # create the directory if it doesn't exist
- os.makedirs(os.path.dirname(path), exist_ok=True)
-
- # Take the screenshot
- page.screenshot(path=path)
-
- print(f"Screenshot saved to {path}")
- return Path(path)
-
-
-def html_to_markdown(html):
- # check that pandoc is installed
- if not shutil.which("pandoc"):
- raise Exception("Pandoc is not installed. Needed for browsing.")
-
- p = subprocess.Popen(
- ["pandoc", "-f", "html", "-t", "markdown"],
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
- stdout, stderr = p.communicate(input=html.encode())
-
- if p.returncode != 0:
- raise Exception(f"Pandoc returned error code {p.returncode}: {stderr.decode()}")
-
- # Post-process the output to remove :::
- markdown = stdout.decode()
- markdown = "\n".join(
- line for line in markdown.split("\n") if not line.strip().startswith(":::")
- )
-
- # Post-process the output to remove div tags
-    markdown = markdown.replace("<div>", "").replace("</div>", "")
-
-    # replace [\n]{3,} with \n\n
-    markdown = re.sub(r"[\n]{3,}", "\n\n", markdown)
-
-    # replace {...} with ''
-    markdown = re.sub(r"\{(#|style|target|\.)[^}]*\}", "", markdown)
-
-    # strip inline images, like: data:image/png;base64,...
-    re_strip_data = re.compile(r"!\[[^\]]*\]\(data:image[^)]*\)")
-
-    # test cases
-    assert re_strip_data.sub("", "![test](data:image/png;base64,123)") == ""
-    assert re_strip_data.sub("", "![test](data:image/png;base64,123) test") == " test"
-
-    markdown = re_strip_data.sub("", markdown)
-
-    return markdown
+    """Take a screenshot of a webpage."""
+    assert browser
+    if browser == "playwright":
+        return screenshot_url_playwright(url, path) # type: ignore
+    raise ValueError("Screenshot not supported with lynx backend")
tool = ToolSpec(
diff --git a/tests/test_browser.py b/tests/test_browser.py
index 6efa2538..84937062 100644
--- a/tests/test_browser.py
+++ b/tests/test_browser.py
@@ -1,19 +1,10 @@
import pytest
+# TODO: we should also test lynx backend
playwright = pytest.importorskip("playwright")
# noreorder
-from gptme.tools.browser import load_page, read_url, search # fmt: skip
-
-# noreorder
-from playwright.sync_api import expect # fmt: skip
-
-
-@pytest.mark.slow
-def test_browser():
- page = load_page("https://superuserlabs.org")
- expect(page.get_by_role("main")).to_contain_text("Erik Bjäreholt")
-
+from gptme.tools.browser import read_url, search # fmt: skip
# FIXME: Broken
# @pytest.mark.slow
@@ -37,5 +28,5 @@ def test_read_url_with_links():
# check that "Erik Bjäreholt" is present
assert "Erik Bjäreholt" in s
- # check that markdown link to activitywatch is present
- assert "(https://activitywatch.net/)" in s
+ # check that link to activitywatch is present
+ assert "https://activitywatch.net/" in s