From c353e4cb733c09bfc68ed307c8adb408d2276d0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Erik=20Bj=C3=A4reholt?= <erik@bjareho.lt>
Date: Sat, 14 Dec 2024 18:40:28 +0100
Subject: [PATCH] fix: improved path detection in prompts, now works with more
 adjacent punctuation types

---
 gptme/chat.py      | 108 +++++++++++++++++++++++++++++----------------
 tests/test_chat.py |  79 +++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+), 39 deletions(-)
 create mode 100644 tests/test_chat.py

diff --git a/gptme/chat.py b/gptme/chat.py
index a47b40d2..ed68a164 100644
--- a/gptme/chat.py
+++ b/gptme/chat.py
@@ -278,6 +278,60 @@ def prompt_input(prompt: str, value=None) -> str:  # pragma: no cover
     return value
 
 
+def _find_potential_paths(content: str) -> list[str]:
+    """
+    Find potential file paths and URLs in a message content.
+    Excludes content within code blocks.
+
+    Inline-backtick spans are checked first, then the remaining
+    whitespace-separated words. Trailing "?", ".", "," and "!" are stripped
+    from each candidate. Candidates are not verified to exist.
+
+    Args:
+        content: The message content to search
+
+    Returns:
+        List of potential paths/URLs found in the message
+    """
+    # Remove code blocks to avoid matching paths inside them
+    content_no_codeblocks = re.sub(r"```[\s\S]*?```", "", content)
+
+    # Names in the current directory, for relative path matching
+    cwd_files = {f.name for f in Path.cwd().iterdir()}
+
+    def clean(word: str) -> str:
+        """Helper to trim whitespace and trailing sentence punctuation"""
+        # A single rstrip over the whole set handles mixed endings such as
+        # "file?!", which chained per-character rstrip calls left as "file?"
+        return word.strip().rstrip("?.,!")
+
+    def is_path_like(word: str) -> bool:
+        """Helper to check if a word looks like a path"""
+        return (
+            # Absolute/home/relative paths
+            word.startswith(("/", "~/", "./"))
+            # URLs
+            or word.startswith("http")
+            # Any other word containing a slash
+            or "/" in word
+            # Files in current directory or subdirectories
+            or word.split("/", 1)[0] in cwd_files
+        )
+
+    # First find backtick-wrapped content
+    spans = re.findall(r"`([^`]+)`", content_no_codeblocks)
+    paths = [word for word in map(clean, spans) if is_path_like(word)]
+
+    # Then find non-backtick-wrapped words
+    # Remove backtick-wrapped content first to avoid double-processing
+    content_no_backticks = re.sub(r"`[^`]+`", "", content_no_codeblocks)
+    for word in map(clean, re.split(r"\s+", content_no_backticks)):
+        if word and is_path_like(word):
+            paths.append(word)
+
+    return paths
+
+
 def _include_paths(msg: Message, workspace: Path | None = None) -> Message:
     """
     Searches the message for any valid paths and:
@@ -296,48 +350,24 @@ def _include_paths(msg: Message, workspace: Path | None = None) -> Message:
     # TODO: add support for directories?
     assert msg.role == "user"
 
-    # list the current directory
-    cwd_files = [f.name for f in Path.cwd().iterdir()]
-
-    # match absolute, home, relative paths, and URLs anywhere in the message
-    # could be wrapped with spaces or backticks, possibly followed by a question mark
-    # don't look in codeblocks, and don't match paths that are already in codeblocks
-    # TODO: this will misbehave if there are codeblocks (or triple backticks) in codeblocks
-    content_no_codeblocks = re.sub(r"```.*?\n```", "", msg.content, flags=re.DOTALL)
-
     append_msg = ""
     files = []
 
-    for word in re.split(r"[\s`]", content_no_codeblocks):
-        # remove wrapping backticks
-        word = word.strip("`")
-        # remove trailing question mark
-        word = word.rstrip("?")
-        if not word:
-            continue
-        if (
-            # if word starts with a path character
-            any(word.startswith(s) for s in ["/", "~/", "./"])
-            # or word is a URL
-            or word.startswith("http")
-            # or word is a file in the current dir,
-            # or a path that starts in a folder in the current dir
-            or any(word.split("/", 1)[0] == file for file in cwd_files)
-        ):
-            logger.debug(f"potential path/url: {word=}")
-            # If not using fresh context, include text file contents in the message
-            if not use_fresh_context and (contents := _parse_prompt(word)):
-                # if we found a valid path, replace it with the contents of the file
-                append_msg += "\n\n" + contents
-            else:
-                # if we found an non-text file, include it in msg.files
-                file = _parse_prompt_files(word)
-                if file:
-                    # Store path relative to workspace if provided
-                    file = file.expanduser()
-                    if workspace and not file.is_absolute():
-                        file = file.absolute().relative_to(workspace)
-                    files.append(file)
+    # Find potential paths in message
+    for word in _find_potential_paths(msg.content):
+        logger.debug(f"potential path/url: {word=}")
+        # If not using fresh context, include text file contents in the message
+        if not use_fresh_context and (contents := _parse_prompt(word)):
+            append_msg += "\n\n" + contents
+        else:
+            # if we found an non-text file, include it in msg.files
+            file = _parse_prompt_files(word)
+            if file:
+                # Store path relative to workspace if provided
+                file = file.expanduser()
+                if workspace and not file.is_absolute():
+                    file = file.absolute().relative_to(workspace)
+                files.append(file)
 
     if files:
         msg = msg.replace(files=msg.files + files)
diff --git a/tests/test_chat.py b/tests/test_chat.py
new file mode 100644
index 00000000..35153926
--- /dev/null
+++ b/tests/test_chat.py
@@ -0,0 +1,79 @@
+import os
+from pathlib import Path
+from gptme.chat import _find_potential_paths
+
+
+def test_find_potential_paths(tmp_path):
+    """Each supported path/URL form is detected; fenced code is skipped."""
+    # Files the bare-name and subdirectory matches rely on
+    (tmp_path / "test.txt").touch()
+    nested = tmp_path / "subdir"
+    nested.mkdir()
+    (nested / "file.py").touch()
+
+    # Test various path formats
+    message = """
+        Here are some paths:
+        /absolute/path
+        ~/home/path
+        ./relative/path
+        test.txt
+        subdir/file.py
+        http://example.com
+        https://example.com/path
+
+        ```python
+        # This path should be ignored
+        ignored_path = "/path/in/codeblock"
+        ```
+
+        More text with `wrapped/path` and path.with.dots
+        """
+    expected = [
+        "/absolute/path",
+        "~/home/path",
+        "./relative/path",
+        "test.txt",  # exists in tmp_path
+        "subdir/file.py",  # exists in tmp_path
+        "http://example.com",
+        "https://example.com/path",
+        "wrapped/path",
+    ]
+
+    # The cwd is listed during detection, so run it from the temp directory
+    previous_cwd = Path.cwd()
+    os.chdir(tmp_path)
+    try:
+        found = _find_potential_paths(message)
+    finally:
+        os.chdir(previous_cwd)
+
+    missing = [path for path in expected if path not in found]
+    assert not missing
+    # Paths in codeblocks and dotted non-path words must not be reported
+    assert "/path/in/codeblock" not in found
+    assert "path.with.dots" not in found
+
+
+def test_find_potential_paths_empty(tmp_path, monkeypatch):
+    # Run from an empty directory: a cwd entry named "just" would count as a path
+    monkeypatch.chdir(tmp_path)
+    # Neither empty content nor plain prose should yield any candidates
+    assert _find_potential_paths("") == []
+    assert _find_potential_paths("just some text") == []
+
+
+def test_find_potential_paths_punctuation():
+    """Paths followed by "!", "?", "." or "," are found in their bare form."""
+    # Test paths with trailing punctuation
+    message = """
+    Look at ~/file.txt!
+    Check /path/to/file?
+    See ./local/path.
+    Visit https://example.com,
+    """
+    expected = ["~/file.txt", "/path/to/file", "./local/path", "https://example.com"]
+
+    found = _find_potential_paths(message)
+    missing = [path for path in expected if path not in found]
+    assert not missing