From 175167e091b74fc435beaefde77bd30c88499272 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Bj=C3=A4reholt?= Date: Fri, 1 Nov 2024 13:45:40 +0100 Subject: [PATCH] feat: implement anthropic-style computer tool (#225) * feat: started working on anthropic-style computer tool * Apply suggestions from code review Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * fix: progress on computer use * fix: added Dockerfile.server * fix: fixed vnc in computer use webui * docs: fixed docs for computer use * fix: rewrote computer_action function to not be a generator * docs: fixed server docs for computer use * docs: refactored computer use warning into seperate file * fix: optimized Dockerfile.computer for faster rebuilds * fix: refactor and misc fixes to computer use * fix: enable select tools in computer use context * fix: multiple fixes to computer use and web ui * fix: disable computer tool unless explicitly enabled * fix: removed deleted file from .dockerignore * docs: minor fix to computer use docs --------- Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> --- .dockerignore | 1 + Makefile | 5 + docs/computer-use-warning.rst | 4 + docs/conf.py | 1 + docs/contributing.rst | 4 +- docs/examples.rst | 27 +++ docs/server.rst | 47 +++- docs/tools.rst | 36 +++ gptme/server/api.py | 12 + gptme/server/cli.py | 28 ++- gptme/server/static/computer.html | 92 ++++++++ gptme/server/static/index.html | 3 +- gptme/server/static/main.js | 67 ++++-- gptme/tools/__init__.py | 13 +- gptme/tools/computer.py | 219 ++++++++++++++++++ gptme/tools/python.py | 4 +- gptme/tools/screenshot.py | 2 +- gptme/tools/vision.py | 2 + poetry.lock | 19 +- pyproject.toml | 4 + scripts/Dockerfile | 13 -- scripts/Dockerfile.computer | 102 ++++++++ scripts/Dockerfile.server | 23 ++ .../tint2/applications/firefox-custom.desktop | 8 + .../.config/tint2/applications/gedit.desktop | 8 + .../tint2/applications/terminal.desktop | 8 + 
scripts/computer_home/.config/tint2/tint2rc | 100 ++++++++ scripts/computer_home/entrypoint.sh | 11 + scripts/computer_home/mutter_startup.sh | 22 ++ scripts/computer_home/novnc_startup.sh | 29 +++ scripts/computer_home/start_all.sh | 8 + scripts/computer_home/tint2_startup.sh | 22 ++ scripts/computer_home/x11vnc_startup.sh | 45 ++++ scripts/computer_home/xvfb_startup.sh | 23 ++ 34 files changed, 965 insertions(+), 47 deletions(-) create mode 100644 docs/computer-use-warning.rst create mode 100644 gptme/server/static/computer.html create mode 100644 gptme/tools/computer.py create mode 100644 scripts/Dockerfile.computer create mode 100644 scripts/Dockerfile.server create mode 100755 scripts/computer_home/.config/tint2/applications/firefox-custom.desktop create mode 100755 scripts/computer_home/.config/tint2/applications/gedit.desktop create mode 100644 scripts/computer_home/.config/tint2/applications/terminal.desktop create mode 100644 scripts/computer_home/.config/tint2/tint2rc create mode 100755 scripts/computer_home/entrypoint.sh create mode 100755 scripts/computer_home/mutter_startup.sh create mode 100755 scripts/computer_home/novnc_startup.sh create mode 100755 scripts/computer_home/start_all.sh create mode 100755 scripts/computer_home/tint2_startup.sh create mode 100755 scripts/computer_home/x11vnc_startup.sh create mode 100755 scripts/computer_home/xvfb_startup.sh diff --git a/.dockerignore b/.dockerignore index f3aff7dc..3fa7309b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -10,6 +10,7 @@ gptme.toml # Build scripts scripts +!scripts/computer_home .github # Build/test/coverage/docs/prof directories diff --git a/Makefile b/Makefile index 57464325..23a3ff5d 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,12 @@ build: build-docker: docker build . -t gptme:latest -f scripts/Dockerfile + docker build . -t gptme-server:latest -f scripts/Dockerfile.server docker build . -t gptme-eval:latest -f scripts/Dockerfile.eval + # docker build . 
-t gptme-eval:latest -f scripts/Dockerfile.eval --build-arg RUST=yes --build-arg BROWSER=yes + +build-docker-computer: + docker build . -t gptme-computer:latest -f scripts/Dockerfile.computer build-docker-dev: docker build . -t gptme-dev:latest -f scripts/Dockerfile.dev diff --git a/docs/computer-use-warning.rst b/docs/computer-use-warning.rst new file mode 100644 index 00000000..d90f162d --- /dev/null +++ b/docs/computer-use-warning.rst @@ -0,0 +1,4 @@ +.. warning:: + + The computer use interface is experimental and has serious security implications. + Please use with caution and see Anthropic's documentation on `computer use <https://docs.anthropic.com/en/docs/build-with-claude/computer-use>`_ for additional guidance. diff --git a/docs/conf.py b/docs/conf.py index 8dbed0c1..f8617bea 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -118,6 +118,7 @@ def setup(app): ("py:class", "flask.app.Flask"), ("py:class", "gptme.tools.python.T"), ("py:class", "threading.Thread"), + ("py:class", "gptme.tools.computer.ScalingSource"), ] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/contributing.rst b/docs/contributing.rst index 03977a1a..abb48644 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -17,10 +17,10 @@ Install # checkout the code and navigate to the root of the project git clone https://github.com/ErikBjare/gptme.git cd gptme - + # install poetry (if not installed) pipx install poetry - + # activate the virtualenv poetry shell diff --git a/docs/examples.rst b/docs/examples.rst index 34bb0ad6..72afdd71 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -64,3 +64,30 @@ Generate docstrings for all functions in a file: gptme --non-interactive "Patch these files to include concise docstrings for all functions, skip functions that already have docstrings. Include: brief description, parameters." $@ These examples demonstrate how gptme can be used to create simple yet powerful automation tools. 
Each script can be easily customized and expanded to fit specific project needs. + +.. rubric:: Computer Use Examples + +Using the computer tool for GUI automation and desktop interaction (requires running the server with computer use support): + +.. code-block:: bash + + # Start server with computer use support + docker run -p 5000:5000 -p 8080:8080 -p 6080:6080 ghcr.io/erikbjare/gptme:latest-server + + # Then in another terminal: + + # Open and interact with an application + gptme 'open firefox and navigate to example.com' + + # GUI automation with visual feedback + gptme 'create a simple drawing in xpaint' + + # Desktop automation with keyboard/mouse + gptme 'open calculator and compute 15 * 23' + +The computer use interface at http://localhost:8080 provides a split view with: +- Chat interface on the left +- Desktop view on the right +- Controls for toggling interaction mode + +This enables complex GUI automation tasks with visual feedback and confirmation. diff --git a/docs/server.rst b/docs/server.rst index 8e33665a..76c3c1cb 100644 --- a/docs/server.rst +++ b/docs/server.rst @@ -16,12 +16,49 @@ It can be started by running the following command: Web UI ------ -.. code-block:: bash +The server provides two interfaces: - gptme-server +1. Basic Chat Interface + + .. code-block:: bash + + gptme-server + + Access the basic chat interface at http://localhost:5000 + + For more usage, see :ref:`the CLI documentation `. + +2. Computer Use Interface + + Requires Docker. + + .. 
code-block:: bash + + # Clone the repository + git clone https://github.com/ErikBjare/gptme.git + cd gptme + # Build container + make build-docker-computer + # Run container + docker run -v ~/.config/gptme:/home/computeruse/.config/gptme -p 6080:6080 -p 8080:8080 gptme-computer:latest + + The computer use interface provides: + + - Combined view at http://localhost:8080/computer + - Chat view at http://localhost:8080 + - Desktop view at http://localhost:6080/vnc.html + + Features: + + - Split view with chat on the left, desktop on the right + - Toggle for view-only/interactive desktop mode + - Fullscreen support + - Automatic screen scaling for optimal LLM vision -This should let you view your chats in a web browser and make basic requests. + Requirements: -You can then access the web UI by visiting http://localhost:5000 in your browser. + - Docker for running the server with X11 support + - Browser with WebSocket support for VNC + - Network ports 6080 (VNC) and 8080 (web UI) available -For more usage, see :ref:`the CLI documentation `. +.. include:: computer-use-warning.rst diff --git a/docs/tools.rst b/docs/tools.rst index 36f55a9b..05599798 100644 --- a/docs/tools.rst +++ b/docs/tools.rst @@ -26,6 +26,7 @@ The tools can be grouped into the following categories: - `Screenshot`_ - `Vision`_ + - `Computer`_ - Chat management @@ -107,3 +108,38 @@ Chats .. automodule:: gptme.tools.chats :members: :noindex: + +Computer +-------- + +.. automodule:: gptme.tools.computer + :members: + :noindex: + +The computer tool provides direct interaction with the desktop environment through X11, allowing for: + +- Keyboard input simulation +- Mouse control (movement, clicks, dragging) +- Screen capture with automatic scaling +- Cursor position tracking + +To use the computer tool, see the instructions for :doc:`server`. 
+ +Example usage:: + + # Type text + computer(action="type", text="Hello, World!") + + # Move mouse and click + computer(action="mouse_move", coordinate=(100, 100)) + computer(action="left_click") + + # Take screenshot + computer(action="screenshot") + + # Send keyboard shortcuts + computer(action="key", text="Control_L+c") + +The tool automatically handles screen resolution scaling to ensure optimal performance with LLM vision capabilities. + +.. include:: computer-use-warning.rst diff --git a/gptme/server/api.py b/gptme/server/api.py index fda5538e..72bb9eef 100644 --- a/gptme/server/api.py +++ b/gptme/server/api.py @@ -148,6 +148,18 @@ def root(): return current_app.send_static_file("index.html") +# serve computer interface +@api.route("/computer") +def computer(): + return current_app.send_static_file("computer.html") + + +# serve chat interface (for embedding in computer view) +@api.route("/chat") +def chat(): + return current_app.send_static_file("index.html") + + @api.route("/favicon.png") def favicon(): return flask.send_from_directory(media_path, "logo.png") diff --git a/gptme/server/cli.py b/gptme/server/cli.py index 08e1474d..f0decfdf 100644 --- a/gptme/server/cli.py +++ b/gptme/server/cli.py @@ -16,14 +16,36 @@ default=None, help="Model to use by default, can be overridden in each request.", ) -def main(debug: bool, verbose: bool, model: str | None): # pragma: no cover +@click.option( + "--host", + default="127.0.0.1", + help="Host to bind the server to.", +) +@click.option( + "--port", + default="5000", + help="Port to run the server on.", +) +@click.option("--tools", default=None, help="Tools to enable, comma separated.") +def main( + debug: bool, + verbose: bool, + model: str | None, + host: str, + port: str, + tools: str | None, +): # pragma: no cover """ Starts a server and web UI for gptme. Note that this is very much a work in progress, and is not yet ready for normal use. 
""" init_logging(verbose) - init(model, interactive=False, tool_allowlist=None) + init( + model, + interactive=False, + tool_allowlist=None if tools is None else tools.split(","), + ) # if flask not installed, ask the user to install `server` extras try: @@ -37,4 +59,4 @@ def main(debug: bool, verbose: bool, model: str | None): # pragma: no cover click.echo("Initialization complete, starting server") app = create_app() - app.run(debug=debug) + app.run(debug=debug, host=host, port=int(port)) diff --git a/gptme/server/static/computer.html b/gptme/server/static/computer.html new file mode 100644 index 00000000..33994147 --- /dev/null +++ b/gptme/server/static/computer.html @@ -0,0 +1,92 @@ + + + + gptme - Computer Use + + + + +
+
+ + +
+ +
+
+ + +
+ + + diff --git a/gptme/server/static/index.html b/gptme/server/static/index.html index 9e1263a3..8df9f719 100644 --- a/gptme/server/static/index.html +++ b/gptme/server/static/index.html @@ -151,8 +151,9 @@

{{ selectedConversatio