feat(forge/llm): Add LlamafileProvider (#7091)
* Add minimal implementation of `LlamafileProvider`, a new `ChatModelProvider` for llamafiles. It extends `BaseOpenAIProvider` and only overrides methods that are necessary to get the system to work at a basic level.

* Add support for `mistral-7b-instruct-v0.2`. This is currently the only model supported by `LlamafileProvider`, because it is the only model I have tested with.

* Add instructions to use AutoGPT with llamafile in the docs at `autogpt/setup/index.md`
* Add helper script to get it running quickly at `scripts/llamafile/serve.py`

---------

Co-authored-by: Reinier van der Leer <[email protected]>
k8si and Pwuts authored Jul 17, 2024
1 parent 97a5582 commit 62c420e
Showing 10 changed files with 680 additions and 10 deletions.
3 changes: 3 additions & 0 deletions autogpt/.env.template
@@ -11,6 +11,9 @@
## GROQ_API_KEY - Groq API Key (Example: gsk_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx)
# GROQ_API_KEY=

## LLAMAFILE_API_BASE - Llamafile API base URL
# LLAMAFILE_API_BASE=http://localhost:8080/v1

## TELEMETRY_OPT_IN - Share telemetry on errors and other issues with the AutoGPT team, e.g. through Sentry.
## This helps us to spot and solve problems earlier & faster. (Default: DISABLED)
# TELEMETRY_OPT_IN=true
3 changes: 3 additions & 0 deletions autogpt/scripts/llamafile/.gitignore
@@ -0,0 +1,3 @@
*.llamafile
*.llamafile.exe
llamafile.exe
165 changes: 165 additions & 0 deletions autogpt/scripts/llamafile/serve.py
@@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Use llamafile to serve a (quantized) mistral-7b-instruct-v0.2 model

Usage:
  cd <repo-root>/autogpt
  ./scripts/llamafile/serve.py
"""

import os
import platform
import subprocess
from pathlib import Path
from typing import Optional

import click

LLAMAFILE = Path("mistral-7b-instruct-v0.2.Q5_K_M.llamafile")
LLAMAFILE_URL = f"https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/{LLAMAFILE.name}"  # noqa
LLAMAFILE_EXE = Path("llamafile.exe")
LLAMAFILE_EXE_URL = "https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.6/llamafile-0.8.6"  # noqa


@click.command()
@click.option(
    "--llamafile",
    type=click.Path(dir_okay=False, path_type=Path),
    help=f"Name of the llamafile to serve. Default: {LLAMAFILE.name}",
)
@click.option("--llamafile_url", help="Download URL for the llamafile you want to use")
@click.option(
    "--host", help="Specify the address for the llamafile server to listen on"
)
@click.option(
    "--port", type=int, help="Specify the port for the llamafile server to listen on"
)
@click.option(
    "--force-gpu",
    is_flag=True,
    hidden=platform.system() != "Darwin",
    help="Run the model using only the GPU (AMD or Nvidia). "
    "Otherwise, both CPU and GPU may be (partially) used.",
)
def main(
    llamafile: Optional[Path] = None,
    llamafile_url: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    force_gpu: bool = False,
):
    if not llamafile:
        if not llamafile_url:
            llamafile = LLAMAFILE
        else:
            llamafile = Path(llamafile_url.rsplit("/", 1)[1])
            if llamafile.suffix != ".llamafile":
                click.echo(
                    click.style(
                        "The given URL does not end with '.llamafile' -> "
                        "can't get filename from URL. "
                        "Specify the filename using --llamafile.",
                        fg="red",
                    ),
                    err=True,
                )
                return

    if llamafile == LLAMAFILE and not llamafile_url:
        llamafile_url = LLAMAFILE_URL
    elif llamafile == LLAMAFILE and llamafile_url != LLAMAFILE_URL:
        if not click.prompt(
            click.style(
                "You seem to have specified a different URL for the default model "
                f"({llamafile.name}). Are you sure this is correct? "
                "If you want to use a different model, also specify --llamafile.",
                fg="yellow",
            ),
            type=bool,
        ):
            return

    # Go to autogpt/scripts/llamafile/
    os.chdir(Path(__file__).resolve().parent)

    on_windows = platform.system() == "Windows"

    if not llamafile.is_file():
        if not llamafile_url:
            click.echo(
                click.style(
                    "Please use --llamafile_url to specify a download URL for "
                    f"'{llamafile.name}'. "
                    "This will only be necessary once, so we can download the model.",
                    fg="red",
                ),
                err=True,
            )
            return

        download_file(llamafile_url, llamafile)

        if not on_windows:
            llamafile.chmod(0o755)
            subprocess.run([f"./{llamafile}", "--version"], check=True)

    if not on_windows:
        base_command = [f"./{llamafile}"]
    else:
        # Windows does not allow executables over 4GB, so we have to download a
        # model-less llamafile.exe and run that instead.
        if not LLAMAFILE_EXE.is_file():
            download_file(LLAMAFILE_EXE_URL, LLAMAFILE_EXE)
            LLAMAFILE_EXE.chmod(0o755)
            subprocess.run([f".\\{LLAMAFILE_EXE}", "--version"], check=True)

        base_command = [f".\\{LLAMAFILE_EXE}", "-m", llamafile]

    if host:
        base_command.extend(["--host", host])
    if port:
        base_command.extend(["--port", str(port)])
    if force_gpu:
        # -ngl sets the number of model layers to offload to the GPU;
        # 9999 effectively offloads all of them.
        base_command.extend(["-ngl", "9999"])

    subprocess.run(
        [
            *base_command,
            "--server",
            "--nobrowser",
            "--ctx-size",
            "0",
            "--n-predict",
            "1024",
        ],
        check=True,
    )
    # note: --ctx-size 0 means the prompt context size will be set directly from the
    # underlying model configuration. This may cause slow response times or consume
    # a lot of memory.


def download_file(url: str, to_file: Path) -> None:
    print(f"Downloading {to_file.name}...")
    import urllib.request

    urllib.request.urlretrieve(url, to_file, reporthook=report_download_progress)
    print()


def report_download_progress(chunk_number: int, chunk_size: int, total_size: int):
    if total_size != -1:
        downloaded_size = chunk_number * chunk_size
        percent = min(1, downloaded_size / total_size)
        bar = "#" * int(40 * percent)
        print(
            f"\rDownloading: [{bar:<40}] {percent:.0%}"
            f" - {downloaded_size/1e6:.1f}/{total_size/1e6:.1f} MB",
            end="",
        )


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions docs/content/AutoGPT/configuration/options.md
@@ -22,6 +22,7 @@ You can set configuration variables via the `.env` file. If you don't have a `.e
- `GROQ_API_KEY`: Set this if you want to use Groq models with AutoGPT
- `HUGGINGFACE_API_TOKEN`: HuggingFace API, to be used for both image generation and audio to text. Optional.
- `HUGGINGFACE_IMAGE_MODEL`: HuggingFace model to use for image generation. Default: CompVis/stable-diffusion-v1-4
- `LLAMAFILE_API_BASE`: Llamafile API base URL. Default: `http://localhost:8080/v1`
- `OPENAI_API_KEY`: Set this if you want to use OpenAI models; [OpenAI API Key](https://platform.openai.com/account/api-keys).
- `OPENAI_ORGANIZATION`: Organization ID in OpenAI. Optional.
- `PLAIN_OUTPUT`: Plain output, which disables the spinner. Default: False
63 changes: 63 additions & 0 deletions docs/content/AutoGPT/setup/index.md
@@ -198,3 +198,66 @@ If you don't know which to choose, you can safely go with OpenAI*.
[groq/api-keys]: https://console.groq.com/keys
[groq/models]: https://console.groq.com/docs/models

### Llamafile

With llamafile you can run models locally, which means no need to set up billing,
and guaranteed data privacy.

For more information and in-depth documentation, check out the [llamafile documentation].

!!! warning
    At the moment, llamafile only serves one model at a time. This means you can not
    set `SMART_LLM` and `FAST_LLM` to two different llamafile models.

!!! warning
    Due to the issues linked below, llamafiles don't work on WSL. To use a llamafile
    with AutoGPT in WSL, you will have to run the llamafile in Windows (outside WSL).

    <details>
    <summary>Instructions</summary>

    1. Get the `llamafile/serve.py` script through one of these two ways:
        1. Clone the AutoGPT repo somewhere in your Windows environment,
           with the script located at `autogpt/scripts/llamafile/serve.py`
        2. Download just the [serve.py] script somewhere in your Windows environment
    2. Make sure you have `click` installed: `pip install click`
    3. Run `ip route | grep default | awk '{print $3}'` *inside WSL* to get the address
       of the WSL host machine
    4. Run `python3 serve.py --host {WSL_HOST_ADDR}`, where `{WSL_HOST_ADDR}`
       is the address you found at step 3.
       If port 8080 is taken, also specify a different port using `--port {PORT}`.
    5. In WSL, set `LLAMAFILE_API_BASE=http://{WSL_HOST_ADDR}:8080/v1` in your `.env`.
    6. Follow the rest of the regular instructions below.

    [serve.py]: https://github.com/Significant-Gravitas/AutoGPT/blob/master/autogpt/scripts/llamafile/serve.py
    </details>

    * [Mozilla-Ocho/llamafile#356](https://github.com/Mozilla-Ocho/llamafile/issues/356)
    * [Mozilla-Ocho/llamafile#100](https://github.com/Mozilla-Ocho/llamafile/issues/100)

!!! note
    These instructions will download and use `mistral-7b-instruct-v0.2.Q5_K_M.llamafile`.
    `mistral-7b-instruct-v0.2` is currently the only tested and supported model.
    If you want to try other models, you'll have to add them to `LlamafileModelName` in
    [`llamafile.py`][forge/llamafile.py].
    For optimal results, you may also have to add some logic to adapt the message format,
    like `LlamafileProvider._adapt_chat_messages_for_mistral_instruct(..)` does.
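
For illustration, the kind of adaptation meant here usually amounts to folding `system` messages into adjacent user turns, because the Mistral-instruct chat template only knows `user` and `assistant` roles. Below is a minimal, standalone sketch of that idea, not the actual `LlamafileProvider` implementation:

```python
def adapt_messages_for_mistral_instruct(messages: list[dict]) -> list[dict]:
    """Fold system messages into user messages, since Mistral-instruct only
    supports alternating user/assistant turns."""
    adapted: list[dict] = []
    for message in messages:
        # Treat system messages as user messages...
        role = "user" if message["role"] == "system" else message["role"]
        if adapted and adapted[-1]["role"] == role == "user":
            # ...and merge consecutive user/system content into one user turn.
            adapted[-1]["content"] += "\n\n" + message["content"]
        else:
            adapted.append({"role": role, "content": message["content"]})
    return adapted
```

For example, `[{"role": "system", "content": "Be brief."}, {"role": "user", "content": "Hi!"}]` becomes a single user message containing `"Be brief.\n\nHi!"`.
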
1. Run the llamafile serve script:
    ```shell
    python3 ./scripts/llamafile/serve.py
    ```
    The first time this is run, it will download a file containing the model + runtime,
    which may take a while and a few gigabytes of disk space.

    To force GPU acceleration, add `--force-gpu` to the command.

2. In `.env`, set `SMART_LLM`, `FAST_LLM`, or both to `mistral-7b-instruct-v0.2`.

3. If the server is running on a different address than `http://localhost:8080/v1`,
   set `LLAMAFILE_API_BASE` in `.env` to the right base URL.

[llamafile documentation]: https://github.com/Mozilla-Ocho/llamafile#readme
[forge/llamafile.py]: https://github.com/Significant-Gravitas/AutoGPT/blob/master/forge/forge/llm/providers/llamafile/llamafile.py
36 changes: 36 additions & 0 deletions forge/forge/llm/providers/llamafile/README.md
@@ -0,0 +1,36 @@
# Llamafile Integration Notes

Tested with:
* Python 3.11
* Apple M2 Pro (32 GB), macOS 14.2.1
* quantized mistral-7b-instruct-v0.2

## Setup

Download a `mistral-7b-instruct-v0.2` llamafile:
```shell
wget -nc https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.llamafile
chmod +x mistral-7b-instruct-v0.2.Q5_K_M.llamafile
./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --version
```

Run the llamafile server:
```shell
LLAMAFILE="./mistral-7b-instruct-v0.2.Q5_K_M.llamafile"

"${LLAMAFILE}" \
--server \
--nobrowser \
--ctx-size 0 \
--n-predict 1024

# note: ctx-size=0 means the prompt context size will be set directly from the
# underlying model configuration. This may cause slow response times or consume
# a lot of memory.
```
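
The server exposes an OpenAI-compatible API at `http://localhost:8080/v1`, which is what the AutoGPT integration builds on. A quick way to sanity-check it from Python (a sketch, assuming the `openai` package is installed; the placeholder API key is fine as long as the server was started without `--api-key`):

```python
from openai import OpenAI

# llamafile does not check the key unless --api-key was passed to the server.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

response = client.chat.completions.create(
    # The server serves whichever model it was started with; this value is informational.
    model="mistral-7b-instruct-v0.2",
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```
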

## TODOs

* `SMART_LLM`/`FAST_LLM` configuration: Currently, the llamafile server only serves one model at a time. However, there's no reason you can't start multiple llamafile servers on different ports. To support using different models for `smart_llm` and `fast_llm`, you could implement config vars like `LLAMAFILE_SMART_LLM_URL` and `LLAMAFILE_FAST_LLM_URL` that point to different llamafile servers (one serving a 'big model' and one serving a 'fast model'). A rough sketch of this idea follows the list below.
* Authorization: the `serve.py` script does not set up any authorization for the llamafile server; this can be turned on by adding the argument `--api-key <some-key>` to the server startup command. However, I haven't tested whether the AutoGPT integration still works when this feature is turned on.
* Test with other models
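
Regarding the first TODO: a rough sketch of what running a 'smart' and a 'fast' llamafile side by side could look like, reusing the server flags from `serve.py`. The llamafile names and the `LLAMAFILE_SMART_LLM_URL`/`LLAMAFILE_FAST_LLM_URL` variables are hypothetical; AutoGPT does not read such variables yet.

```python
import subprocess

# Hypothetical: one llamafile per role, each on its own port.
SERVERS = {
    "LLAMAFILE_SMART_LLM_URL": ("./big-model.llamafile", 8080),
    "LLAMAFILE_FAST_LLM_URL": ("./fast-model.llamafile", 8081),
}

processes = []
for env_var, (llamafile, port) in SERVERS.items():
    # Same server flags as scripts/llamafile/serve.py, minus the download logic.
    proc = subprocess.Popen(
        [llamafile, "--server", "--nobrowser", "--port", str(port)]
    )
    processes.append(proc)
    print(f"{env_var}=http://localhost:{port}/v1")

for proc in processes:
    proc.wait()
```
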
17 changes: 17 additions & 0 deletions forge/forge/llm/providers/llamafile/__init__.py
@@ -0,0 +1,17 @@
from .llamafile import (
    LLAMAFILE_CHAT_MODELS,
    LLAMAFILE_EMBEDDING_MODELS,
    LlamafileCredentials,
    LlamafileModelName,
    LlamafileProvider,
    LlamafileSettings,
)

__all__ = [
    "LLAMAFILE_CHAT_MODELS",
    "LLAMAFILE_EMBEDDING_MODELS",
    "LlamafileCredentials",
    "LlamafileModelName",
    "LlamafileProvider",
    "LlamafileSettings",
]
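
For orientation: per the commit description, `LlamafileProvider` extends forge's `BaseOpenAIProvider` and overrides only what is needed to talk to a llamafile server. The skeleton below is an illustrative sketch of that shape, not the code added in this commit; apart from the exported names above and `_adapt_chat_messages_for_mistral_instruct(..)` (referenced in the setup docs), the enum member name, method signature, and body are assumptions.

```python
# Illustrative sketch only; see forge/forge/llm/providers/llamafile/llamafile.py
# for the real implementation.
import enum


class LlamafileModelName(str, enum.Enum):
    # Member name is an assumption; the value matches the documented model.
    MISTRAL_7B_INSTRUCT_V0_2 = "mistral-7b-instruct-v0.2"


class LlamafileProvider:  # in the real code this extends BaseOpenAIProvider
    """Chat model provider for a llamafile server's OpenAI-compatible endpoint."""

    def _adapt_chat_messages_for_mistral_instruct(
        self, messages: list[dict]
    ) -> list[dict]:
        # Mistral-instruct has no system role, so system content is folded into
        # the user turns (see the sketch in docs/content/AutoGPT/setup/index.md).
        raise NotImplementedError("placeholder; not the actual logic")
```
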
