diff --git a/gptme.toml b/gptme.toml
index 0e0b2ab5..df5b7fda 100644
--- a/gptme.toml
+++ b/gptme.toml
@@ -1 +1 @@
-files = ["README.md", "gptme/cli.py", "docs/*.rst", "docs/*.md"]
+files = ["README.md", "Makefile", "gptme/cli.py", "docs/*.rst", "docs/*.md"]
diff --git a/gptme/cli.py b/gptme/cli.py
index 2ad54004..ddf70528 100644
--- a/gptme/cli.py
+++ b/gptme/cli.py
@@ -114,7 +114,7 @@
 )
 @click.option(
     "--workspace",
-    help="Path to workspace directory.",
+    help="Path to workspace directory. Pass '@log' to create a workspace in the log directory.",
     default=".",
 )
 def main(
@@ -153,13 +153,8 @@ def main(
     if no_confirm:
         logger.warning("Skipping all confirmation prompts.")
 
-    workspace_prompt = get_workspace_prompt(workspace)
-
     # get initial system prompt
     initial_msgs = [get_prompt(prompt_system)]
-    initial_msgs[
-        0
-    ].content += f"\n\nSelected project files, read more with cat: {workspace_prompt}"
 
     # if stdin is not a tty, we're getting piped input, which we should include in the prompt
     if not sys.stdin.isatty():
@@ -196,6 +191,7 @@ def main(
         no_confirm,
         interactive,
         show_hidden,
+        workspace,
     )
 
 
@@ -208,12 +204,14 @@ def chat(
     no_confirm: bool = False,
     interactive: bool = True,
     show_hidden: bool = False,
+    workspace: str = ".",
 ):
     """
     Run the chat loop.
 
     prompt_msgs: list of messages to execute in sequence.
     initial_msgs: list of history messages.
+    workspace: path to workspace directory, or @log to create one in the log directory.
 
     Callable from other modules.
     """
@@ -227,6 +225,28 @@ def chat(
     print(f"Using logdir {logfile.parent}")
     log = LogManager.load(logfile, initial_msgs=initial_msgs, show_hidden=show_hidden)
 
+    # change to workspace directory
+    # reuse the existing workspace if present, create one if "@log", otherwise use the given path
+    if (logfile.parent / "workspace").exists():
+        assert workspace in ["@log", "."], "Workspace already exists"
+        workspace_path = logfile.parent / "workspace"
+        print(f"Using workspace at {workspace_path}")
+    elif workspace == "@log":
+        workspace_path = logfile.parent / "workspace"
+        print(f"Creating workspace at {workspace_path}")
+        os.makedirs(workspace_path, exist_ok=True)
+    else:
+        workspace_path = Path(workspace)
+        assert (
+            workspace_path.exists()
+        ), f"Workspace path {workspace_path} does not exist"
+    os.chdir(workspace_path)
+
+    # add the workspace's project files to the context as a system message
+    workspace_prompt = get_workspace_prompt(str(workspace_path))
+    if workspace_prompt:
+        log.append(Message("system", workspace_prompt))
+
     # print log
     log.print()
     print("--- ^^^ past messages ^^^ ---")
@@ -388,8 +408,8 @@ def get_logfile(name: str | Literal["random", "resume"], interactive=True) -> Pa
         for f in prev_conv_files
     ]
 
-    # don't run pick in tests/non-interactive mode
-    if interactive:
+    # don't run pick in tests/non-interactive mode, or if the user specifies a name
+    if interactive and name not in ["random", "ask"]:
         options = [
             NEW_CONV,
         ] + prev_convs
diff --git a/gptme/config.py b/gptme/config.py
index 17ca5766..404ace5c 100644
--- a/gptme/config.py
+++ b/gptme/config.py
@@ -121,10 +121,6 @@ def set_config_value(key: str, value: str) -> None:
 
 
 def get_workspace_prompt(workspace: str) -> str:
-    if not os.path.exists(workspace):
-        logger.error(f"Workspace directory {workspace} does not exist")
-        exit(1)
-    os.chdir(workspace)
     project_config_paths = [
         p
         for p in (
@@ -148,7 +144,7 @@
                     f"File {file} specified in project config does not exist"
                 )
                 exit(1)
-        return "\n\nSelected project files, read more with cat:\n" + "\n".join(
+        return "\n\nSelected project files, read more with cat:\n" + "\n\n".join(
             [f"```{Path(file).name}\n{Path(file).read_text()}\n```" for file in files]
         )
     return ""
diff --git a/gptme/eval/agents.py b/gptme/eval/agents.py
index 87d8b8f5..d55aa1cf 100644
--- a/gptme/eval/agents.py
+++ b/gptme/eval/agents.py
@@ -1,13 +1,16 @@
-import os
+import logging
 from abc import abstractmethod
 
 from gptme import Message
 from gptme import chat as gptme_chat
 from gptme import get_prompt
+from gptme.cli import get_name
 
 from .filestore import FileStore
 from .types import Files
 
+logger = logging.getLogger(__name__)
+
 
 class Agent:
     def __init__(self, model: str):
@@ -23,14 +26,21 @@ def act(self, files: Files | None, prompt: str) -> Files:
 class GPTMe(Agent):
     def act(self, files: Files | None, prompt: str):
-        store = FileStore()
-        os.chdir(store.working_dir)  # can now modify store content
+        _id = abs(hash(prompt)) % 1000000
+        name = f"gptme-evals-{self.model.replace('/', '--')}-{_id}"
+        logdir = get_name(name)
+        workspace_dir = logdir / "workspace"
+        if workspace_dir.exists():
+            raise FileExistsError(
+                f"Workspace directory {workspace_dir} already exists"
+            )
+        store = FileStore(working_dir=workspace_dir)
 
         if files:
             store.upload(files)
 
         print("\n--- Start of generation ---")
-        print(f"Working in {store.working_dir}")
+        logger.debug(f"Working in {store.working_dir}")
         prompt_sys = get_prompt()
         prompt_sys.content += (
             "\n\nIf you have trouble and dont seem to make progress, stop trying."
         )
@@ -40,10 +50,11 @@ def act(self, files: Files | None, prompt: str):
             gptme_chat(
                 [Message("user", prompt)],
                 [prompt_sys],
-                f"gptme-evals-{store.id}",
+                name=name,
                 model=self.model,
                 no_confirm=True,
                 interactive=False,
+                workspace="@log",  # this will be the same directory as workspace_dir
             )
         # don't exit on sys.exit()
         except (SystemExit, KeyboardInterrupt):
diff --git a/gptme/eval/filestore.py b/gptme/eval/filestore.py
index dbca76f0..57a0196f 100644
--- a/gptme/eval/filestore.py
+++ b/gptme/eval/filestore.py
@@ -6,10 +6,12 @@
 
 
 class FileStore:
-    def __init__(self):
-        self.working_dir = Path(tempfile.mkdtemp(prefix="gptme-evals-"))
+    def __init__(self, working_dir: Path | None = None):
+        if working_dir:
+            self.working_dir = working_dir
+        else:
+            self.working_dir = Path(tempfile.mkdtemp(prefix="gptme-evals-"))
         self.working_dir.mkdir(parents=True, exist_ok=True)
-        self.id = self.working_dir.name.split("-")[-1]
 
     def upload(self, files: Files):
         for name, content in files.items():
diff --git a/gptme/eval/main.py b/gptme/eval/main.py
index 050604ac..f1fef6a3 100644
--- a/gptme/eval/main.py
+++ b/gptme/eval/main.py
@@ -65,23 +65,43 @@ class ProcessError:
 
 ProcessResult = Union[ProcessSuccess, ProcessError]
 
 
+class StreamTee(io.TextIOBase):
+    """Capture writes to a stream, optionally forwarding them to the original stream as well."""
+
+    # NOTE: toggling keep_stream can be useful for debugging
+    def __init__(self, stream, keep_stream=False):
+        self.stream = stream
+        self.captured = io.StringIO()
+        self.keep_stream = keep_stream
+
+    def write(self, message) -> int:
+        self.captured.write(message)
+        if self.keep_stream:
+            self.stream.write(message)
+        return len(message)
+
+    def getvalue(self):
+        return self.captured.getvalue()
+
+
 def act_process(agent, files, prompt, queue: "Queue[ProcessResult]"):
     # Runs in a process for each eval
     # each eval has a process group, so we can kill all child processes
     os.setpgrp()
 
     # redirect stdout and stderr to streams
-    stdout, stderr = io.StringIO(), io.StringIO()
-    stdout_orig, stderr_orig = sys.stdout, sys.stderr
-    sys.stdout, sys.stderr = stdout, stderr
+    stdout = StreamTee(sys.stdout)
+    stderr = StreamTee(sys.stderr)
+    sys.stdout, sys.stderr = stdout, stderr  # type: ignore
 
     def error_handler(e):
         duration = time.time() - start
-        sys.stdout, sys.stderr = stdout_orig, stderr_orig
+        sys.stdout, sys.stderr = stdout.stream, stderr.stream
         print(f"Error: {e}")
         queue.put(ProcessError(str(e), stdout.getvalue(), stderr.getvalue(), duration))
         # kill child processes
         # os.killpg(0, signal.SIGKILL)
+        sys.exit(1)
 
     # handle SIGTERM
@@ -93,9 +113,9 @@ def sigterm_handler(*_):
     start = time.time()
     files = agent.act(files, prompt)
     duration = time.time() - start
-    sys.stdout, sys.stderr = stdout_orig, stderr_orig
+    sys.stdout, sys.stderr = stdout.stream, stderr.stream
     queue.put(ProcessSuccess(files, stdout.getvalue(), stderr.getvalue(), duration))
-    print("Process finished")
+    print("Process finished successfully")
     # It seems that adding this prevents the queue from syncing or something, maybe SIGKILL is too harsh...
     # os.killpg(0, signal.SIGKILL)
 
@@ -105,7 +125,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
     """
     Executes the code for a specific model with a timeout.
     """
-    print(
+    logger.info(
         f'Running "{test["name"]}" with prompt "{test["prompt"]}" for model: {agent.model}'
     )
 
@@ -120,7 +140,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
 
     status: Status = "success"
     if p.is_alive():
-        print("Timeout reached, terminating process")
+        logger.info("Timeout reached, terminating process")
         p.terminate()
         p.join(timeout=1)
         status = "timeout"
@@ -141,7 +161,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
         }
 
     logger.info("Got result")
-    if status == "success":
+    if status != "timeout":
         time_gen = result.duration
         stdout, stderr = result.stdout, result.stderr
 
@@ -169,7 +189,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
 
     ctx = ResultContext(files, stdout_run, stderr_run, exit_code)
     results: list[CaseResult] = []
-    print(f"\n--- Results for {test['name']} ---")
+    print(f"\n--- Results for '{test['name']}' with {agent.model} ---")
     for name, case in test["expect"].items():
         code = inspect.getsource(case).strip()
         eval_start = time.time()
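
Note: the workspace resolution added to `chat()` in `gptme/cli.py` is the core behavioral change here and can be hard to follow in diff form. Below is a minimal, illustrative sketch of the same branching as a standalone function; it is not part of the patch, `resolve_workspace` is a hypothetical name, and the `os.chdir` side effect is left out.

```python
from pathlib import Path


def resolve_workspace(logdir: Path, workspace: str = ".") -> Path:
    """Sketch of the workspace resolution used in chat():
    reuse an existing per-conversation workspace, create one when '@log'
    is passed, otherwise use the given path (which must already exist)."""
    log_workspace = logdir / "workspace"
    if log_workspace.exists():
        # a workspace was already created for this conversation; don't point elsewhere
        assert workspace in ["@log", "."], "Workspace already exists"
        return log_workspace
    if workspace == "@log":
        log_workspace.mkdir(parents=True, exist_ok=True)
        return log_workspace
    path = Path(workspace)
    assert path.exists(), f"Workspace path {path} does not exist"
    return path


if __name__ == "__main__":
    import tempfile

    logdir = Path(tempfile.mkdtemp(prefix="gptme-logs-"))
    print(resolve_workspace(logdir, "@log"))  # creates <logdir>/workspace
    print(resolve_workspace(logdir))          # a resumed conversation reuses it
```

This is also why the eval agent can pass `workspace="@log"` and know the chat will run inside the same `<logdir>/workspace` directory that its `FileStore` was pointed at.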