Skip to content

Commit

Permalink
fix: improved path detection in prompts, now works with more adjacent punctuation types
Browse files Browse the repository at this point in the history
  • Loading branch information
ErikBjare committed Dec 14, 2024
1 parent d598ff3 commit c353e4c
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 39 deletions.
108 changes: 69 additions & 39 deletions gptme/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,60 @@ def prompt_input(prompt: str, value=None) -> str: # pragma: no cover
return value


def _find_potential_paths(content: str) -> list[str]:
"""
Find potential file paths and URLs in a message content.
Excludes content within code blocks.
Args:
content: The message content to search
Returns:
List of potential paths/URLs found in the message
"""
# Remove code blocks to avoid matching paths inside them
content_no_codeblocks = re.sub(r"```[\s\S]*?```", "", content)

# List current directory contents for relative path matching
cwd_files = [f.name for f in Path.cwd().iterdir()]

paths = []

def is_path_like(word: str) -> bool:
"""Helper to check if a word looks like a path"""
return (
# Absolute/home/relative paths
any(word.startswith(s) for s in ["/", "~/", "./"])
# URLs
or word.startswith("http")
# Contains slash (for backtick-wrapped paths)
or "/" in word
# Files in current directory or subdirectories
or any(word.split("/", 1)[0] == file for file in cwd_files)
)

# First find backtick-wrapped content
for match in re.finditer(r"`([^`]+)`", content_no_codeblocks):
word = match.group(1).strip()
word = word.rstrip("?").rstrip(".").rstrip(",").rstrip("!")
if is_path_like(word):
paths.append(word)

# Then find non-backtick-wrapped words
# Remove backtick-wrapped content first to avoid double-processing
content_no_backticks = re.sub(r"`[^`]+`", "", content_no_codeblocks)
for word in re.split(r"\s+", content_no_backticks):
word = word.strip()
word = word.rstrip("?").rstrip(".").rstrip(",").rstrip("!")
if not word:
continue

if is_path_like(word):
paths.append(word)

return paths


def _include_paths(msg: Message, workspace: Path | None = None) -> Message:
"""
Searches the message for any valid paths and:
Expand All @@ -296,48 +350,24 @@ def _include_paths(msg: Message, workspace: Path | None = None) -> Message:
# TODO: add support for directories?
assert msg.role == "user"

# list the current directory
cwd_files = [f.name for f in Path.cwd().iterdir()]

# match absolute, home, relative paths, and URLs anywhere in the message
# could be wrapped with spaces or backticks, possibly followed by a question mark
# don't look in codeblocks, and don't match paths that are already in codeblocks
# TODO: this will misbehave if there are codeblocks (or triple backticks) in codeblocks
content_no_codeblocks = re.sub(r"```.*?\n```", "", msg.content, flags=re.DOTALL)

append_msg = ""
files = []

for word in re.split(r"[\s`]", content_no_codeblocks):
# remove wrapping backticks
word = word.strip("`")
# remove trailing question mark
word = word.rstrip("?")
if not word:
continue
if (
# if word starts with a path character
any(word.startswith(s) for s in ["/", "~/", "./"])
# or word is a URL
or word.startswith("http")
# or word is a file in the current dir,
# or a path that starts in a folder in the current dir
or any(word.split("/", 1)[0] == file for file in cwd_files)
):
logger.debug(f"potential path/url: {word=}")
# If not using fresh context, include text file contents in the message
if not use_fresh_context and (contents := _parse_prompt(word)):
# if we found a valid path, replace it with the contents of the file
append_msg += "\n\n" + contents
else:
# if we found an non-text file, include it in msg.files
file = _parse_prompt_files(word)
if file:
# Store path relative to workspace if provided
file = file.expanduser()
if workspace and not file.is_absolute():
file = file.absolute().relative_to(workspace)
files.append(file)
# Find potential paths in message
for word in _find_potential_paths(msg.content):
logger.debug(f"potential path/url: {word=}")
# If not using fresh context, include text file contents in the message
if not use_fresh_context and (contents := _parse_prompt(word)):
append_msg += "\n\n" + contents
else:
# if we found an non-text file, include it in msg.files
file = _parse_prompt_files(word)
if file:
# Store path relative to workspace if provided
file = file.expanduser()
if workspace and not file.is_absolute():
file = file.absolute().relative_to(workspace)
files.append(file)

if files:
msg = msg.replace(files=msg.files + files)
Expand Down
79 changes: 79 additions & 0 deletions tests/test_chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
from pathlib import Path
from gptme.chat import _find_potential_paths


def test_find_potential_paths(tmp_path):
    """Detect every supported path/URL form in a single mixed message.

    The working directory is switched to ``tmp_path`` (populated with
    ``test.txt`` and ``subdir/file.py``) for the duration of the call so
    that bare relative names can be matched against its entries, and is
    restored afterwards even if an assertion fails.
    """
    # Entries that the relative-path cases below refer to
    (tmp_path / "test.txt").touch()
    subdir = tmp_path / "subdir"
    subdir.mkdir()
    (subdir / "file.py").touch()

    # One candidate per line, a fenced code block that must be skipped, and
    # a final line with an inline-backtick path next to a dotted non-path.
    message = """
Here are some paths:
/absolute/path
~/home/path
./relative/path
test.txt
subdir/file.py
http://example.com
https://example.com/path
```python
# This path should be ignored
ignored_path = "/path/in/codeblock"
```
More text with `wrapped/path` and path.with.dots
"""

    must_be_found = (
        "/absolute/path",
        "~/home/path",
        "./relative/path",
        "test.txt",  # exists in tmp_path
        "subdir/file.py",  # exists in tmp_path
        "http://example.com",
        "https://example.com/path",
        "wrapped/path",
    )
    must_be_ignored = (
        "/path/in/codeblock",  # only appears inside the code block
        "path.with.dots",  # not a path
    )

    previous_cwd = Path.cwd()
    try:
        os.chdir(tmp_path)
        found = _find_potential_paths(message)

        for candidate in must_be_found:
            assert candidate in found
        for candidate in must_be_ignored:
            assert candidate not in found
    finally:
        os.chdir(previous_cwd)


def test_find_potential_paths_empty(tmp_path, monkeypatch):
    """Messages without any path-like word must yield an empty list.

    ``_find_potential_paths`` reports any word that names an entry of the
    current working directory, so the outcome for plain text depends on
    where the test is run: a stray file called ``text`` would make the
    second assertion fail. Running inside the empty ``tmp_path`` keeps the
    test hermetic; ``monkeypatch`` restores the previous directory on
    teardown.
    """
    monkeypatch.chdir(tmp_path)

    # Test with empty content
    assert _find_potential_paths("") == []

    # Test with no paths
    assert _find_potential_paths("just some text") == []


def test_find_potential_paths_punctuation():
    """Trailing ``!``, ``?``, ``.`` and ``,`` are not part of a detected path."""
    # Each line ends its path/URL with a different punctuation character
    message = """
Look at ~/file.txt!
Check /path/to/file?
See ./local/path.
Visit https://example.com,
"""

    found = _find_potential_paths(message)

    stripped_forms = (
        "~/file.txt",
        "/path/to/file",
        "./local/path",
        "https://example.com",
    )
    for candidate in stripped_forms:
        assert candidate in found

0 comments on commit c353e4c

Please sign in to comment.