diff --git a/gptme/tools/_browser_lynx.py b/gptme/tools/_browser_lynx.py new file mode 100644 index 00000000..ad470ae3 --- /dev/null +++ b/gptme/tools/_browser_lynx.py @@ -0,0 +1,29 @@ +""" +Browser tool by calling lynx --dump +""" + +import subprocess + + +def read_url(url): + return subprocess.run( + ["lynx", "--dump", url, "--display_charset=utf-8"], stdout=subprocess.PIPE + ).stdout.decode("utf-8") + + +def search(query, engine="google"): + if engine == "google": + return read_url(f"https://www.google.com/search?q={query}") + elif engine == "duckduckgo": + return read_url(f"https://duckduckgo.com/?q={query}") + raise ValueError(f"Unknown search engine: {engine}") + + +def test_read_url(): + print(read_url("https://gptme.org/")) + print(read_url("https://github.com/ErikBjare/gptme/issues/205")) + + +def test_search(): + print(search("Python", "google")) + print(search("Python", "duckduckgo")) diff --git a/gptme/tools/_browser_playwright.py b/gptme/tools/_browser_playwright.py index 2c40ae00..a3827765 100644 --- a/gptme/tools/_browser_playwright.py +++ b/gptme/tools/_browser_playwright.py @@ -1,7 +1,13 @@ import atexit import logging +import os +import re +import shutil +import subprocess +import tempfile import urllib.parse from dataclasses import dataclass +from pathlib import Path from playwright.sync_api import ( ElementHandle, @@ -48,6 +54,19 @@ def load_page(url: str) -> Page: return page +def read_url(url: str) -> str: + """Read the text of a webpage and return the text in Markdown format.""" + page = load_page(url) + + # Get the HTML of the body + body_html = page.inner_html("body") + + # Convert the HTML to Markdown + markdown = html_to_markdown(body_html) + + return markdown + + def search_google(query: str) -> str: query = urllib.parse.quote(query) url = f"https://www.google.com/search?q={query}&hl=en" @@ -169,3 +188,64 @@ def _list_results_duckduckgo(page) -> str: desc = result.query_selector("span").inner_text().strip().split("\n")[0] 
hits.append(SearchResult(title, url, desc)) return titleurl_to_list(hits) + + +def screenshot_url(url: str, path: Path | str | None = None) -> Path: + """Take a screenshot of a webpage and save it to a file.""" + logger.info(f"Taking screenshot of '{url}' and saving to '{path}'") + page = load_page(url) + + if path is None: + path = tempfile.mktemp(suffix=".png") + else: + # create the directory if it doesn't exist + os.makedirs(os.path.dirname(path), exist_ok=True) + + # Take the screenshot + page.screenshot(path=path) + + print(f"Screenshot saved to {path}") + return Path(path) + + +def html_to_markdown(html): + # check that pandoc is installed + if not shutil.which("pandoc"): + raise Exception("Pandoc is not installed. Needed for browsing.") + + p = subprocess.Popen( + ["pandoc", "-f", "html", "-t", "markdown"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = p.communicate(input=html.encode()) + + if p.returncode != 0: + raise Exception(f"Pandoc returned error code {p.returncode}: {stderr.decode()}") + + # Post-process the output to remove ::: + markdown = stdout.decode() + markdown = "\n".join( + line for line in markdown.split("\n") if not line.strip().startswith(":::") + ) + + # Post-process the output to remove div tags + markdown = markdown.replace("
<div>", "").replace("</div>
", "") + + # replace [\n]{3,} with \n\n + markdown = re.sub(r"[\n]{3,}", "\n\n", markdown) + + # replace {...} with '' + markdown = re.sub(r"\{(#|style|target|\.)[^}]*\}", "", markdown) + + # strip inline images, like: data:image/png;base64,... + re_strip_data = re.compile(r"!\[[^\]]*\]\(data:image[^)]*\)") + + # test cases + assert re_strip_data.sub("", "![test]()") == "" + assert re_strip_data.sub("", "![test]() test") == " test" + + markdown = re_strip_data.sub("", markdown) + + return markdown diff --git a/gptme/tools/browser.py b/gptme/tools/browser.py index 182efd76..fa66c1ab 100644 --- a/gptme/tools/browser.py +++ b/gptme/tools/browser.py @@ -19,26 +19,28 @@ import importlib.util import logging -import os -import re import shutil -import subprocess -import tempfile from pathlib import Path from typing import Literal from .base import ToolSpec, ToolUse -has_playwright = importlib.util.find_spec("playwright") is not None +has_playwright = lambda: importlib.util.find_spec("playwright") is not None # noqa +has_lynx = lambda: shutil.which("lynx") # noqa +browser: Literal["playwright", "lynx"] | None = ( + "playwright" if has_playwright() else ("lynx" if has_lynx() else None) +) # noreorder -if has_playwright: - from ._browser_playwright import ( # fmt: skip - load_page, - search_duckduckgo, - search_google, +if browser == "playwright": + from ._browser_playwright import read_url as read_url_playwright # fmt: skip + from ._browser_playwright import ( + screenshot_url as screenshot_url_playwright, # fmt: skip ) - + from ._browser_playwright import search_duckduckgo, search_google # fmt: skip +elif browser == "lynx": + from ._browser_lynx import read_url as read_url_lynx # fmt: skip + from ._browser_lynx import search as search_lynx # fmt: skip logger = logging.getLogger(__name__) @@ -83,92 +85,43 @@ def has_browser_tool(): - return has_playwright + return browser is not None def read_url(url: str) -> str: - """Read the text of a webpage and return the text in 
Markdown format.""" - page = load_page(url) - - # Get the HTML of the body - body_html = page.inner_html("body") - - # Convert the HTML to Markdown - markdown = html_to_markdown(body_html) - - return markdown + """Read a webpage in a text format.""" + assert browser + if browser == "playwright": + return read_url_playwright(url) # type: ignore + elif browser == "lynx": + return read_url_lynx(url) # type: ignore def search(query: str, engine: EngineType = "google") -> str: """Search for a query on a search engine.""" logger.info(f"Searching for '{query}' on {engine}") + if browser == "playwright": + return search_playwright(query, engine) + elif browser == "lynx": + return search_lynx(query, engine) # type: ignore + raise ValueError(f"Unknown search engine: {engine}") + + +def search_playwright(query: str, engine: EngineType = "google") -> str: + """Search for a query on a search engine using Playwright.""" if engine == "google": - return search_google(query) + return search_google(query) # type: ignore elif engine == "duckduckgo": - return search_duckduckgo(query) - else: - raise ValueError(f"Unknown search engine: {engine}") + return search_duckduckgo(query) # type: ignore + raise ValueError(f"Unknown search engine: {engine}") def screenshot_url(url: str, path: Path | str | None = None) -> Path: - """Take a screenshot of a webpage and save it to a file.""" - logger.info(f"Taking screenshot of '{url}' and saving to '{path}'") - page = load_page(url) - - if path is None: - path = tempfile.mktemp(suffix=".png") - else: - # create the directory if it doesn't exist - os.makedirs(os.path.dirname(path), exist_ok=True) - - # Take the screenshot - page.screenshot(path=path) - - print(f"Screenshot saved to {path}") - return Path(path) - - -def html_to_markdown(html): - # check that pandoc is installed - if not shutil.which("pandoc"): - raise Exception("Pandoc is not installed. 
Needed for browsing.") - - p = subprocess.Popen( - ["pandoc", "-f", "html", "-t", "markdown"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = p.communicate(input=html.encode()) - - if p.returncode != 0: - raise Exception(f"Pandoc returned error code {p.returncode}: {stderr.decode()}") - - # Post-process the output to remove ::: - markdown = stdout.decode() - markdown = "\n".join( - line for line in markdown.split("\n") if not line.strip().startswith(":::") - ) - - # Post-process the output to remove div tags - markdown = markdown.replace("
<div>", "").replace("</div>
", "") - - # replace [\n]{3,} with \n\n - markdown = re.sub(r"[\n]{3,}", "\n\n", markdown) - - # replace {...} with '' - markdown = re.sub(r"\{(#|style|target|\.)[^}]*\}", "", markdown) - - # strip inline images, like: data:image/png;base64,... - re_strip_data = re.compile(r"!\[[^\]]*\]\(data:image[^)]*\)") - - # test cases - assert re_strip_data.sub("", "![test]()") == "" - assert re_strip_data.sub("", "![test]() test") == " test" - - markdown = re_strip_data.sub("", markdown) - - return markdown + """Take a screenshot of a webpage.""" + assert browser + if browser == "playwright": + return screenshot_url_playwright(url, path) # type: ignore + raise ValueError("Screenshot not supported with lynx backend") tool = ToolSpec( diff --git a/tests/test_browser.py b/tests/test_browser.py index 6efa2538..84937062 100644 --- a/tests/test_browser.py +++ b/tests/test_browser.py @@ -1,19 +1,10 @@ import pytest +# TODO: we should also test lynx backend playwright = pytest.importorskip("playwright") # noreorder -from gptme.tools.browser import load_page, read_url, search # fmt: skip - -# noreorder -from playwright.sync_api import expect # fmt: skip - - -@pytest.mark.slow -def test_browser(): - page = load_page("https://superuserlabs.org") - expect(page.get_by_role("main")).to_contain_text("Erik Bjäreholt") - +from gptme.tools.browser import read_url, search # fmt: skip # FIXME: Broken # @pytest.mark.slow @@ -37,5 +28,5 @@ def test_read_url_with_links(): # check that "Erik Bjäreholt" is present assert "Erik Bjäreholt" in s - # check that markdown link to activitywatch is present - assert "(https://activitywatch.net/)" in s + # check that link to activitywatch is present + assert "https://activitywatch.net/" in s