From c353e4cb733c09bfc68ed307c8adb408d2276d0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Erik=20Bj=C3=A4reholt?=
Date: Sat, 14 Dec 2024 18:40:28 +0100
Subject: [PATCH] fix: improved path detection in prompts, now works with more adjacent punctuation types

---
 gptme/chat.py      | 108 +++++++++++++++++++++++++++++----------------
 tests/test_chat.py |  79 +++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+), 39 deletions(-)
 create mode 100644 tests/test_chat.py

diff --git a/gptme/chat.py b/gptme/chat.py
index a47b40d2..ed68a164 100644
--- a/gptme/chat.py
+++ b/gptme/chat.py
@@ -278,6 +278,60 @@ def prompt_input(prompt: str, value=None) -> str:  # pragma: no cover
     return value
 
 
+def _find_potential_paths(content: str) -> list[str]:
+    """
+    Find potential file paths and URLs in a message content.
+    Excludes content within code blocks.
+
+    Args:
+        content: The message content to search
+
+    Returns:
+        List of potential paths/URLs found in the message
+    """
+    # Remove code blocks to avoid matching paths inside them
+    content_no_codeblocks = re.sub(r"```[\s\S]*?```", "", content)
+
+    # List current directory contents for relative path matching
+    cwd_files = [f.name for f in Path.cwd().iterdir()]
+
+    paths = []
+
+    def is_path_like(word: str) -> bool:
+        """Helper to check if a word looks like a path"""
+        return (
+            # Absolute/home/relative paths
+            any(word.startswith(s) for s in ["/", "~/", "./"])
+            # URLs
+            or word.startswith("http")
+            # Contains slash (for backtick-wrapped paths)
+            or "/" in word
+            # Files in current directory or subdirectories
+            or any(word.split("/", 1)[0] == file for file in cwd_files)
+        )
+
+    # First find backtick-wrapped content
+    for match in re.finditer(r"`([^`]+)`", content_no_codeblocks):
+        word = match.group(1).strip()
+        word = word.rstrip("?.,!")  # strip any run of trailing punctuation, in any order
+        if is_path_like(word):
+            paths.append(word)
+
+    # Then find non-backtick-wrapped words
+    # Remove backtick-wrapped content first to avoid double-processing
+    content_no_backticks = re.sub(r"`[^`]+`", "", content_no_codeblocks)
+    for word in re.split(r"\s+", content_no_backticks):
+        word = word.strip()
+        word = word.rstrip("?.,!")  # strip any run of trailing punctuation, in any order
+        if not word:
+            continue
+
+        if is_path_like(word):
+            paths.append(word)
+
+    return paths
+
+
 def _include_paths(msg: Message, workspace: Path | None = None) -> Message:
     """
     Searches the message for any valid paths and:
@@ -296,48 +350,24 @@ def _include_paths(msg: Message, workspace: Path | None = None) -> Message:
     # TODO: add support for directories?
     assert msg.role == "user"
 
-    # list the current directory
-    cwd_files = [f.name for f in Path.cwd().iterdir()]
-
-    # match absolute, home, relative paths, and URLs anywhere in the message
-    # could be wrapped with spaces or backticks, possibly followed by a question mark
-    # don't look in codeblocks, and don't match paths that are already in codeblocks
-    # TODO: this will misbehave if there are codeblocks (or triple backticks) in codeblocks
-    content_no_codeblocks = re.sub(r"```.*?\n```", "", msg.content, flags=re.DOTALL)
-
     append_msg = ""
     files = []
 
-    for word in re.split(r"[\s`]", content_no_codeblocks):
-        # remove wrapping backticks
-        word = word.strip("`")
-        # remove trailing question mark
-        word = word.rstrip("?")
-        if not word:
-            continue
-        if (
-            # if word starts with a path character
-            any(word.startswith(s) for s in ["/", "~/", "./"])
-            # or word is a URL
-            or word.startswith("http")
-            # or word is a file in the current dir,
-            # or a path that starts in a folder in the current dir
-            or any(word.split("/", 1)[0] == file for file in cwd_files)
-        ):
-            logger.debug(f"potential path/url: {word=}")
-            # If not using fresh context, include text file contents in the message
-            if not use_fresh_context and (contents := _parse_prompt(word)):
-                # if we found a valid path, replace it with the contents of the file
-                append_msg += "\n\n" + contents
-            else:
-                # if we found an non-text file, include it in msg.files
-                file = _parse_prompt_files(word)
-                if file:
-                    # Store path relative to workspace if provided
-                    file = file.expanduser()
-                    if workspace and not file.is_absolute():
-                        file = file.absolute().relative_to(workspace)
-                    files.append(file)
+    # Find potential paths in message
+    for word in _find_potential_paths(msg.content):
+        logger.debug(f"potential path/url: {word=}")
+        # If not using fresh context, include text file contents in the message
+        if not use_fresh_context and (contents := _parse_prompt(word)):
+            append_msg += "\n\n" + contents
+        else:
+            # if we found an non-text file, include it in msg.files
+            file = _parse_prompt_files(word)
+            if file:
+                # Store path relative to workspace if provided
+                file = file.expanduser()
+                if workspace and not file.is_absolute():
+                    file = file.absolute().relative_to(workspace)
+                files.append(file)
 
     if files:
         msg = msg.replace(files=msg.files + files)
diff --git a/tests/test_chat.py b/tests/test_chat.py
new file mode 100644
index 00000000..35153926
--- /dev/null
+++ b/tests/test_chat.py
@@ -0,0 +1,79 @@
+import os
+from pathlib import Path
+from gptme.chat import _find_potential_paths
+
+
+def test_find_potential_paths(tmp_path):
+    # Create some test files
+    (tmp_path / "test.txt").touch()
+    (tmp_path / "subdir").mkdir()
+    (tmp_path / "subdir/file.py").touch()
+
+    # Change to temp directory for testing
+    old_cwd = Path.cwd()
+    try:
+        os.chdir(tmp_path)
+
+        # Test various path formats
+        content = """
+        Here are some paths:
+        /absolute/path
+        ~/home/path
+        ./relative/path
+        test.txt
+        subdir/file.py
+        http://example.com
+        https://example.com/path
+
+        ```python
+        # This path should be ignored
+        ignored_path = "/path/in/codeblock"
+        ```
+
+        More text with `wrapped/path` and path.with.dots
+        """
+
+        paths = _find_potential_paths(content)
+
+        # Check expected paths are found
+        assert "/absolute/path" in paths
+        assert "~/home/path" in paths
+        assert "./relative/path" in paths
+        assert "test.txt" in paths  # exists in tmp_path
+        assert "subdir/file.py" in paths  # exists in tmp_path
+        assert "http://example.com" in paths
+        assert "https://example.com/path" in paths
+        assert "wrapped/path" in paths
+
+        # Check paths in codeblocks are ignored
+        assert "/path/in/codeblock" not in paths
+
+        # Check non-paths are ignored
+        assert "path.with.dots" not in paths
+
+    finally:
+        os.chdir(old_cwd)
+
+
+def test_find_potential_paths_empty():
+    # Test with empty content
+    assert _find_potential_paths("") == []
+
+    # Test with no paths
+    assert _find_potential_paths("just some text") == []
+
+
+def test_find_potential_paths_punctuation():
+    # Test paths with trailing punctuation
+    content = """
+    Look at ~/file.txt!
+    Check /path/to/file?
+    See ./local/path.
+    Visit https://example.com,
+    """
+
+    paths = _find_potential_paths(content)
+    assert "~/file.txt" in paths
+    assert "/path/to/file" in paths
+    assert "./local/path" in paths
+    assert "https://example.com" in paths