Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: added lynx browser support #214

Merged
merged 3 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions gptme/tools/_browser_lynx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
Browser tool by calling lynx --dump
"""

import subprocess


def read_url(url):
return subprocess.run(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider adding error handling for subprocess.run to catch exceptions if the lynx command fails. This will help in debugging and provide meaningful error messages.

["lynx", "--dump", url, "--display_charset=utf-8"], stdout=subprocess.PIPE
).stdout.decode("utf-8")


def search(query, engine="google"):
    """Run a web search through lynx and return the dumped result page.

    Args:
        query: Free-text search terms. The value is URL-encoded before it is
            placed in the query string, so spaces, ``&``, ``#`` and similar
            characters cannot truncate or alter the request.
        engine: Either ``"google"`` or ``"duckduckgo"``.

    Returns:
        Whatever ``read_url`` returns for the constructed search URL.

    Raises:
        ValueError: If ``engine`` is not one of the supported names.
    """
    # Imported locally because this module's top-level imports only cover
    # ``subprocess``.
    from urllib.parse import quote_plus

    encoded = quote_plus(str(query))
    if engine == "google":
        return read_url(f"https://www.google.com/search?q={encoded}")
    elif engine == "duckduckgo":
        return read_url(f"https://duckduckgo.com/?q={encoded}")
    raise ValueError(f"Unknown search engine: {engine}")


def test_read_url():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using a testing framework like unittest or pytest for the test functions instead of print statements. This will make the tests more robust and easier to manage.

print(read_url("https://gptme.org/"))
print(read_url("https://github.com/ErikBjare/gptme/issues/205"))


def test_search():
    """Smoke-check ``search`` by printing the results from each engine."""
    for engine_name in ("google", "duckduckgo"):
        print(search("Python", engine_name))
80 changes: 80 additions & 0 deletions gptme/tools/_browser_playwright.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
import atexit
import logging
import os
import re
import shutil
import subprocess
import tempfile
import urllib.parse
from dataclasses import dataclass
from pathlib import Path

from playwright.sync_api import (
ElementHandle,
Expand Down Expand Up @@ -48,6 +54,19 @@ def load_page(url: str) -> Page:
return page


def read_url(url: str) -> str:
    """Load ``url`` and return the inner HTML of its ``<body>`` as Markdown."""
    # Load the page, grab the body's inner HTML, and hand it to the converter.
    return html_to_markdown(load_page(url).inner_html("body"))


def search_google(query: str) -> str:
query = urllib.parse.quote(query)
url = f"https://www.google.com/search?q={query}&hl=en"
Expand Down Expand Up @@ -169,3 +188,64 @@ def _list_results_duckduckgo(page) -> str:
desc = result.query_selector("span").inner_text().strip().split("\n")[0]
hits.append(SearchResult(title, url, desc))
return titleurl_to_list(hits)


def screenshot_url(url: str, path: Path | str | None = None) -> Path:
    """Take a screenshot of a webpage and save it to a file.

    Args:
        url: Address of the page to load and capture.
        path: Destination file. When ``None``, a fresh ``.png`` file is
            created in the system temporary directory.

    Returns:
        The path the screenshot was written to.
    """
    logger.info(f"Taking screenshot of '{url}' and saving to '{path}'")
    page = load_page(url)

    if path is None:
        # mkstemp creates the file atomically; mktemp only returns a name and
        # is deprecated because another process can claim it first.
        fd, path = tempfile.mkstemp(suffix=".png")
        os.close(fd)
    else:
        # Create the parent directory if it doesn't exist. Using Path.parent
        # makes a bare filename resolve to "." instead of "", which
        # os.makedirs would reject.
        Path(path).parent.mkdir(parents=True, exist_ok=True)

    # Take the screenshot
    page.screenshot(path=path)

    print(f"Screenshot saved to {path}")
    return Path(path)


# Pandoc attribute blocks such as {#id}, {.class}, {style=...}, {target=...}.
_PANDOC_ATTRS_RE = re.compile(r"\{(#|style|target|\.)[^}]*\}")
# Inline images embedded as data URIs, like: ![alt](data:image/png;base64,...)
_INLINE_DATA_IMAGE_RE = re.compile(r"!\[[^\]]*\]\(data:image[^)]*\)")
# Runs of three or more consecutive newlines.
_EXTRA_NEWLINES_RE = re.compile(r"\n{3,}")


def html_to_markdown(html):
    """Convert an HTML string to Markdown with pandoc and tidy the output.

    Args:
        html: HTML source text; it is encoded and piped to pandoc on stdin.

    Returns:
        The Markdown produced by ``pandoc -f html -t markdown`` after
        removing ``:::`` fence lines, ``<div>``/``</div>`` tags, pandoc
        attribute blocks and inline ``data:image`` images, and collapsing
        runs of three or more newlines into a single blank line.

    Raises:
        Exception: If pandoc is not on ``PATH`` or exits with a non-zero
            return code.
    """
    # check that pandoc is installed
    if not shutil.which("pandoc"):
        raise Exception("Pandoc is not installed. Needed for browsing.")

    proc = subprocess.run(
        ["pandoc", "-f", "html", "-t", "markdown"],
        input=html.encode(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if proc.returncode != 0:
        raise Exception(
            f"Pandoc returned error code {proc.returncode}: {proc.stderr.decode()}"
        )

    # Drop pandoc's fenced-div marker lines (those starting with ":::").
    markdown = "\n".join(
        line
        for line in proc.stdout.decode().split("\n")
        if not line.strip().startswith(":::")
    )

    # Remove leftover raw div tags.
    markdown = markdown.replace("<div>", "").replace("</div>", "")

    # The order below matches the original pipeline: collapse blank runs,
    # then strip attribute blocks, then strip inline data-URI images.
    markdown = _EXTRA_NEWLINES_RE.sub("\n\n", markdown)
    markdown = _PANDOC_ATTRS_RE.sub("", markdown)
    markdown = _INLINE_DATA_IMAGE_RE.sub("", markdown)

    return markdown
121 changes: 37 additions & 84 deletions gptme/tools/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,28 @@

import importlib.util
import logging
import os
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Literal

from .base import ToolSpec, ToolUse

has_playwright = importlib.util.find_spec("playwright") is not None
has_playwright = lambda: importlib.util.find_spec("playwright") is not None # noqa
has_lynx = lambda: shutil.which("lynx") # noqa
browser: Literal["playwright", "lynx"] | None = (
"playwright" if has_playwright() else ("lynx" if has_lynx() else None)
)

# noreorder
if has_playwright:
from ._browser_playwright import ( # fmt: skip
load_page,
search_duckduckgo,
search_google,
if browser == "playwright":
from ._browser_playwright import read_url as read_url_playwright # fmt: skip
from ._browser_playwright import (
screenshot_url as screenshot_url_playwright, # fmt: skip
)

from ._browser_playwright import search_duckduckgo, search_google # fmt: skip
elif browser == "lynx":
from ._browser_lynx import read_url as read_url_lynx # fmt: skip
from ._browser_lynx import search as search_lynx # fmt: skip

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -83,92 +85,43 @@


def has_browser_tool():
return has_playwright
return browser is not None


def read_url(url: str) -> str:
"""Read the text of a webpage and return the text in Markdown format."""
page = load_page(url)

# Get the HTML of the body
body_html = page.inner_html("body")

# Convert the HTML to Markdown
markdown = html_to_markdown(body_html)

return markdown
"""Read a webpage in a text format."""
assert browser
if browser == "playwright":
return read_url_playwright(url) # type: ignore
elif browser == "lynx":
return read_url_lynx(url) # type: ignore


def search(query: str, engine: EngineType = "google") -> str:
    """Search for a query on a search engine.

    Dispatches to the backend named by the module-level ``browser`` value.

    Args:
        query: The search terms.
        engine: Name of the search engine, passed through to the backend.

    Returns:
        The search results as returned by the selected backend.

    Raises:
        ValueError: If no browser backend was selected (``browser`` is
            neither ``"playwright"`` nor ``"lynx"``).
    """
    logger.info(f"Searching for '{query}' on {engine}")
    if browser == "playwright":
        return search_playwright(query, engine)
    elif browser == "lynx":
        return search_lynx(query, engine)  # type: ignore
    # Reaching this point means no backend matched, so the engine name is
    # not what is wrong; report the real cause.
    raise ValueError("No browser backend available (install playwright or lynx)")


def search_playwright(query: str, engine: EngineType = "google") -> str:
"""Search for a query on a search engine using Playwright."""
if engine == "google":
return search_google(query)
return search_google(query) # type: ignore
elif engine == "duckduckgo":
return search_duckduckgo(query)
else:
raise ValueError(f"Unknown search engine: {engine}")
return search_duckduckgo(query) # type: ignore
raise ValueError(f"Unknown search engine: {engine}")


def screenshot_url(url: str, path: Path | str | None = None) -> Path:
"""Take a screenshot of a webpage and save it to a file."""
logger.info(f"Taking screenshot of '{url}' and saving to '{path}'")
page = load_page(url)

if path is None:
path = tempfile.mktemp(suffix=".png")
else:
# create the directory if it doesn't exist
os.makedirs(os.path.dirname(path), exist_ok=True)

# Take the screenshot
page.screenshot(path=path)

print(f"Screenshot saved to {path}")
return Path(path)


def html_to_markdown(html):
# check that pandoc is installed
if not shutil.which("pandoc"):
raise Exception("Pandoc is not installed. Needed for browsing.")

p = subprocess.Popen(
["pandoc", "-f", "html", "-t", "markdown"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = p.communicate(input=html.encode())

if p.returncode != 0:
raise Exception(f"Pandoc returned error code {p.returncode}: {stderr.decode()}")

# Post-process the output to remove :::
markdown = stdout.decode()
markdown = "\n".join(
line for line in markdown.split("\n") if not line.strip().startswith(":::")
)

# Post-process the output to remove div tags
markdown = markdown.replace("<div>", "").replace("</div>", "")

# replace [\n]{3,} with \n\n
markdown = re.sub(r"[\n]{3,}", "\n\n", markdown)

# replace {...} with ''
markdown = re.sub(r"\{(#|style|target|\.)[^}]*\}", "", markdown)

# strip inline images, like: data:image/png;base64,...
re_strip_data = re.compile(r"!\[[^\]]*\]\(data:image[^)]*\)")

# test cases
assert re_strip_data.sub("", "![test](data:image/png;base64,123)") == ""
assert re_strip_data.sub("", "![test](data:image/png;base64,123) test") == " test"

markdown = re_strip_data.sub("", markdown)

return markdown
"""Take a screenshot of a webpage."""
assert browser
if browser == "playwright":
return screenshot_url_playwright(url, path) # type: ignore
raise ValueError("Screenshot not supported with lynx backend")


tool = ToolSpec(
Expand Down
17 changes: 4 additions & 13 deletions tests/test_browser.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,10 @@
import pytest

# TODO: we should also test lynx backend
playwright = pytest.importorskip("playwright")

# noreorder
from gptme.tools.browser import load_page, read_url, search # fmt: skip

# noreorder
from playwright.sync_api import expect # fmt: skip


@pytest.mark.slow
def test_browser():
page = load_page("https://superuserlabs.org")
expect(page.get_by_role("main")).to_contain_text("Erik Bjäreholt")

from gptme.tools.browser import read_url, search # fmt: skip

# FIXME: Broken
# @pytest.mark.slow
Expand All @@ -37,5 +28,5 @@ def test_read_url_with_links():
# check that "Erik Bjäreholt" is present
assert "Erik Bjäreholt" in s

# check that markdown link to activitywatch is present
assert "(https://activitywatch.net/)" in s
# check that link to activitywatch is present
assert "https://activitywatch.net/" in s
Loading