diff --git a/gptme.toml b/gptme.toml
index 0e0b2ab5..df5b7fda 100644
--- a/gptme.toml
+++ b/gptme.toml
@@ -1 +1 @@
-files = ["README.md", "gptme/cli.py", "docs/*.rst", "docs/*.md"]
+files = ["README.md", "Makefile", "gptme/cli.py", "docs/*.rst", "docs/*.md"]
diff --git a/gptme/cli.py b/gptme/cli.py
index 2ad54004..ddf70528 100644
--- a/gptme/cli.py
+++ b/gptme/cli.py
@@ -114,7 +114,7 @@
 )
 @click.option(
     "--workspace",
-    help="Path to workspace directory.",
+    help="Path to workspace directory. Pass '@log' to create a workspace in the log directory.",
     default=".",
 )
 def main(
@@ -153,13 +153,8 @@ def main(
     if no_confirm:
         logger.warning("Skipping all confirmation prompts.")
 
-    workspace_prompt = get_workspace_prompt(workspace)
-
     # get initial system prompt
     initial_msgs = [get_prompt(prompt_system)]
-    initial_msgs[
-        0
-    ].content += f"\n\nSelected project files, read more with cat: {workspace_prompt}"
 
     # if stdin is not a tty, we're getting piped input, which we should include in the prompt
     if not sys.stdin.isatty():
@@ -196,6 +191,7 @@ def main(
         no_confirm,
         interactive,
         show_hidden,
+        workspace,
     )
 
 
@@ -208,12 +204,14 @@ def chat(
     no_confirm: bool = False,
     interactive: bool = True,
     show_hidden: bool = False,
+    workspace: str = ".",
 ):
     """
     Run the chat loop.
 
     prompt_msgs: list of messages to execute in sequence.
     initial_msgs: list of history messages.
+    workspace: path to workspace directory, or @log to create one in the log directory.
 
     Callable from other modules.
     """
@@ -227,6 +225,28 @@ def chat(
     print(f"Using logdir {logfile.parent}")
     log = LogManager.load(logfile, initial_msgs=initial_msgs, show_hidden=show_hidden)
 
+    # change to workspace directory
+    # reuse the existing workspace if present, create one if "@log", otherwise use the given path
+    if (logfile.parent / "workspace").exists():
+        assert workspace in ["@log", "."], "Workspace already exists"
+        workspace_path = logfile.parent / "workspace"
+        print(f"Using workspace at {workspace_path}")
+    elif workspace == "@log":
+        workspace_path = logfile.parent / "workspace"
+        print(f"Creating workspace at {workspace_path}")
+        os.makedirs(workspace_path, exist_ok=True)
+    else:
+        workspace_path = Path(workspace)
+        assert (
+            workspace_path.exists()
+        ), f"Workspace path {workspace_path} does not exist"
+    os.chdir(workspace_path)
+
+    # add the workspace's project files to the context as a system message
+    workspace_prompt = get_workspace_prompt(str(workspace_path))
+    if workspace_prompt:
+        log.append(Message("system", workspace_prompt))
+
     # print log
     log.print()
     print("--- ^^^ past messages ^^^ ---")
@@ -388,8 +408,8 @@ def get_logfile(name: str | Literal["random", "resume"], interactive=True) -> Pa
         for f in prev_conv_files
     ]
 
-    # don't run pick in tests/non-interactive mode
-    if interactive:
+    # don't run pick in tests/non-interactive mode, or if the user specifies a name
+    if interactive and name not in ["random", "ask"]:
         options = [
             NEW_CONV,
         ] + prev_convs
diff --git a/gptme/config.py b/gptme/config.py
index 17ca5766..404ace5c 100644
--- a/gptme/config.py
+++ b/gptme/config.py
@@ -121,10 +121,6 @@ def set_config_value(key: str, value: str) -> None:
 
 
 def get_workspace_prompt(workspace: str) -> str:
-    if not os.path.exists(workspace):
-        logger.error(f"Workspace directory {workspace} does not exist")
-        exit(1)
-    os.chdir(workspace)
     project_config_paths = [
         p
         for p in (
@@ -148,7 +144,7 @@
                     f"File {file} specified in project config does not exist"
                 )
                 exit(1)
-        return "\n\nSelected project files, read more with cat:\n" + "\n".join(
+        return "\n\nSelected project files, read more with cat:\n" + "\n\n".join(
             [f"```{Path(file).name}\n{Path(file).read_text()}\n```" for file in files]
         )
     return ""
diff --git a/gptme/eval/agents.py b/gptme/eval/agents.py
index 87d8b8f5..d55aa1cf 100644
--- a/gptme/eval/agents.py
+++ b/gptme/eval/agents.py
@@ -1,13 +1,16 @@
-import os
+import logging
 from abc import abstractmethod
 
 from gptme import Message
 from gptme import chat as gptme_chat
 from gptme import get_prompt
+from gptme.cli import get_name
 
 from .filestore import FileStore
 from .types import Files
 
+logger = logging.getLogger(__name__)
+
 
 class Agent:
     def __init__(self, model: str):
@@ -23,14 +26,21 @@ def act(self, files: Files | None, prompt: str) -> Files:
 class GPTMe(Agent):
     def act(self, files: Files | None, prompt: str):
-        store = FileStore()
-        os.chdir(store.working_dir)  # can now modify store content
+        _id = abs(hash(prompt)) % 1000000
+        name = f"gptme-evals-{self.model.replace('/', '--')}-{_id}"
+        logdir = get_name(name)
+        workspace_dir = logdir / "workspace"
+        if workspace_dir.exists():
+            raise FileExistsError(
+                f"Workspace directory {workspace_dir} already exists"
+            )
+        store = FileStore(working_dir=workspace_dir)
 
         if files:
             store.upload(files)
 
         print("\n--- Start of generation ---")
-        print(f"Working in {store.working_dir}")
+        logger.debug(f"Working in {store.working_dir}")
         prompt_sys = get_prompt()
         prompt_sys.content += (
             "\n\nIf you have trouble and dont seem to make progress, stop trying."
         )
@@ -40,10 +50,11 @@ def act(self, files: Files | None, prompt: str):
             gptme_chat(
                 [Message("user", prompt)],
                 [prompt_sys],
-                f"gptme-evals-{store.id}",
+                name=name,
                 model=self.model,
                 no_confirm=True,
                 interactive=False,
+                workspace="@log",  # this will be the same directory as workspace_dir
             )
         # don't exit on sys.exit()
         except (SystemExit, KeyboardInterrupt):
diff --git a/gptme/eval/filestore.py b/gptme/eval/filestore.py
index dbca76f0..57a0196f 100644
--- a/gptme/eval/filestore.py
+++ b/gptme/eval/filestore.py
@@ -6,10 +6,12 @@
 
 
 class FileStore:
-    def __init__(self):
-        self.working_dir = Path(tempfile.mkdtemp(prefix="gptme-evals-"))
+    def __init__(self, working_dir: Path | None = None):
+        if working_dir:
+            self.working_dir = working_dir
+        else:
+            self.working_dir = Path(tempfile.mkdtemp(prefix="gptme-evals-"))
         self.working_dir.mkdir(parents=True, exist_ok=True)
-        self.id = self.working_dir.name.split("-")[-1]
 
     def upload(self, files: Files):
         for name, content in files.items():
diff --git a/gptme/eval/main.py b/gptme/eval/main.py
index 050604ac..f1fef6a3 100644
--- a/gptme/eval/main.py
+++ b/gptme/eval/main.py
@@ -65,23 +65,43 @@ class ProcessError:
 
 ProcessResult = Union[ProcessSuccess, ProcessError]
 
 
+class StreamTee(io.TextIOBase):
+    """Capture writes to a stream, optionally forwarding them to the original stream as well."""
+
+    # NOTE: toggling keep_stream can be useful for debugging
+    def __init__(self, stream, keep_stream=False):
+        self.stream = stream
+        self.captured = io.StringIO()
+        self.keep_stream = keep_stream
+
+    def write(self, message) -> int:
+        self.captured.write(message)
+        if self.keep_stream:
+            self.stream.write(message)
+        return len(message)
+
+    def getvalue(self):
+        return self.captured.getvalue()
+
+
 def act_process(agent, files, prompt, queue: "Queue[ProcessResult]"):
     # Runs in a process for each eval
     # each eval has a process group, so we can kill all child processes
     os.setpgrp()
 
     # redirect stdout and stderr to streams
-    stdout, stderr = io.StringIO(), io.StringIO()
-    stdout_orig, stderr_orig = sys.stdout, sys.stderr
-    sys.stdout, sys.stderr = stdout, stderr
+    stdout = StreamTee(sys.stdout)
+    stderr = StreamTee(sys.stderr)
+    sys.stdout, sys.stderr = stdout, stderr  # type: ignore
 
     def error_handler(e):
         duration = time.time() - start
-        sys.stdout, sys.stderr = stdout_orig, stderr_orig
+        sys.stdout, sys.stderr = stdout.stream, stderr.stream
         print(f"Error: {e}")
         queue.put(ProcessError(str(e), stdout.getvalue(), stderr.getvalue(), duration))
         # kill child processes
         # os.killpg(0, signal.SIGKILL)
+        sys.exit(1)
 
     # handle SIGTERM
@@ -93,9 +113,9 @@ def sigterm_handler(*_):
     start = time.time()
     files = agent.act(files, prompt)
     duration = time.time() - start
-    sys.stdout, sys.stderr = stdout_orig, stderr_orig
+    sys.stdout, sys.stderr = stdout.stream, stderr.stream
     queue.put(ProcessSuccess(files, stdout.getvalue(), stderr.getvalue(), duration))
-    print("Process finished")
+    print("Process finished successfully")
     # It seems that adding this prevents the queue from syncing or something, maybe SIGKILL is too harsh...
     # os.killpg(0, signal.SIGKILL)
 
@@ -105,7 +125,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
     """
     Executes the code for a specific model with a timeout.
     """
-    print(
+    logger.info(
         f'Running "{test["name"]}" with prompt "{test["prompt"]}" for model: {agent.model}'
     )
 
@@ -120,7 +140,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
 
     status: Status = "success"
     if p.is_alive():
-        print("Timeout reached, terminating process")
+        logger.info("Timeout reached, terminating process")
         p.terminate()
         p.join(timeout=1)
         status = "timeout"
@@ -141,7 +161,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
         }
 
     logger.info("Got result")
-    if status == "success":
+    if status != "timeout":
         time_gen = result.duration
         stdout, stderr = result.stdout, result.stderr
 
@@ -169,7 +189,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
 
     ctx = ResultContext(files, stdout_run, stderr_run, exit_code)
     results: list[CaseResult] = []
-    print(f"\n--- Results for {test['name']} ---")
+    print(f"\n--- Results for '{test['name']}' with {agent.model} ---")
     for name, case in test["expect"].items():
         code = inspect.getsource(case).strip()
         eval_start = time.time()
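
Note: the workspace resolution added to `chat()` in `gptme/cli.py` is the core behavioral change here and can be hard to follow in diff form. Below is a minimal, illustrative sketch of the same branching as a standalone function; it is not part of the patch, `resolve_workspace` is a hypothetical name, and the `os.chdir` side effect is left out.

```python
from pathlib import Path


def resolve_workspace(logdir: Path, workspace: str = ".") -> Path:
    """Sketch of the workspace resolution used in chat():
    reuse an existing per-conversation workspace, create one when '@log'
    is passed, otherwise use the given path (which must already exist)."""
    log_workspace = logdir / "workspace"
    if log_workspace.exists():
        # a workspace was already created for this conversation; don't point elsewhere
        assert workspace in ["@log", "."], "Workspace already exists"
        return log_workspace
    if workspace == "@log":
        log_workspace.mkdir(parents=True, exist_ok=True)
        return log_workspace
    path = Path(workspace)
    assert path.exists(), f"Workspace path {path} does not exist"
    return path


if __name__ == "__main__":
    import tempfile

    logdir = Path(tempfile.mkdtemp(prefix="gptme-logs-"))
    print(resolve_workspace(logdir, "@log"))  # creates <logdir>/workspace
    print(resolve_workspace(logdir))          # a resumed conversation reuses it
```

This is also why the eval agent can pass `workspace="@log"` and know the chat will run inside the same `<logdir>/workspace` directory that its `FileStore` was pointed at.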