Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: improved path detection in prompts, now works with more adjacent punctuation types #333

Merged
merged 1 commit into from
Dec 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 69 additions & 39 deletions gptme/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,60 @@ def prompt_input(prompt: str, value=None) -> str: # pragma: no cover
return value


def _find_potential_paths(content: str) -> list[str]:
"""
Find potential file paths and URLs in a message content.
Excludes content within code blocks.

Args:
content: The message content to search

Returns:
List of potential paths/URLs found in the message
"""
# Remove code blocks to avoid matching paths inside them
content_no_codeblocks = re.sub(r"```[\s\S]*?```", "", content)

# List current directory contents for relative path matching
cwd_files = [f.name for f in Path.cwd().iterdir()]

paths = []

def is_path_like(word: str) -> bool:
"""Helper to check if a word looks like a path"""
return (
# Absolute/home/relative paths
any(word.startswith(s) for s in ["/", "~/", "./"])
# URLs
or word.startswith("http")
# Contains slash (for backtick-wrapped paths)
or "/" in word
# Files in current directory or subdirectories
or any(word.split("/", 1)[0] == file for file in cwd_files)
)

# First find backtick-wrapped content
for match in re.finditer(r"`([^`]+)`", content_no_codeblocks):
word = match.group(1).strip()
word = word.rstrip("?").rstrip(".").rstrip(",").rstrip("!")
if is_path_like(word):
paths.append(word)

# Then find non-backtick-wrapped words
# Remove backtick-wrapped content first to avoid double-processing
content_no_backticks = re.sub(r"`[^`]+`", "", content_no_codeblocks)
for word in re.split(r"\s+", content_no_backticks):
word = word.strip()
word = word.rstrip("?").rstrip(".").rstrip(",").rstrip("!")
if not word:
continue

if is_path_like(word):
paths.append(word)

return paths


def _include_paths(msg: Message, workspace: Path | None = None) -> Message:
"""
Searches the message for any valid paths and:
Expand All @@ -296,48 +350,24 @@ def _include_paths(msg: Message, workspace: Path | None = None) -> Message:
# TODO: add support for directories?
assert msg.role == "user"

# list the current directory
cwd_files = [f.name for f in Path.cwd().iterdir()]

# match absolute, home, relative paths, and URLs anywhere in the message
# could be wrapped with spaces or backticks, possibly followed by a question mark
# don't look in codeblocks, and don't match paths that are already in codeblocks
# TODO: this will misbehave if there are codeblocks (or triple backticks) in codeblocks
content_no_codeblocks = re.sub(r"```.*?\n```", "", msg.content, flags=re.DOTALL)

append_msg = ""
files = []

for word in re.split(r"[\s`]", content_no_codeblocks):
# remove wrapping backticks
word = word.strip("`")
# remove trailing question mark
word = word.rstrip("?")
if not word:
continue
if (
# if word starts with a path character
any(word.startswith(s) for s in ["/", "~/", "./"])
# or word is a URL
or word.startswith("http")
# or word is a file in the current dir,
# or a path that starts in a folder in the current dir
or any(word.split("/", 1)[0] == file for file in cwd_files)
):
logger.debug(f"potential path/url: {word=}")
# If not using fresh context, include text file contents in the message
if not use_fresh_context and (contents := _parse_prompt(word)):
# if we found a valid path, replace it with the contents of the file
append_msg += "\n\n" + contents
else:
# if we found a non-text file, include it in msg.files
file = _parse_prompt_files(word)
if file:
# Store path relative to workspace if provided
file = file.expanduser()
if workspace and not file.is_absolute():
file = file.absolute().relative_to(workspace)
files.append(file)
# Find potential paths in message
for word in _find_potential_paths(msg.content):
logger.debug(f"potential path/url: {word=}")
# If not using fresh context, include text file contents in the message
if not use_fresh_context and (contents := _parse_prompt(word)):
append_msg += "\n\n" + contents
else:
# if we found a non-text file, include it in msg.files
file = _parse_prompt_files(word)
if file:
# Store path relative to workspace if provided
file = file.expanduser()
if workspace and not file.is_absolute():
file = file.absolute().relative_to(workspace)
files.append(file)

if files:
msg = msg.replace(files=msg.files + files)
Expand Down
79 changes: 79 additions & 0 deletions tests/test_chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
from pathlib import Path
from gptme.chat import _find_potential_paths


def test_find_potential_paths(tmp_path):
    """Path-like tokens are detected; code-block contents and plain words are not."""
    # Populate the temporary directory so bare names can match cwd entries
    subdir = tmp_path / "subdir"
    (tmp_path / "test.txt").touch()
    subdir.mkdir()
    (subdir / "file.py").touch()

    # Message mixing every supported path format with a fenced code block
    message = """
Here are some paths:
/absolute/path
~/home/path
./relative/path
test.txt
subdir/file.py
http://example.com
https://example.com/path

```python
# This path should be ignored
ignored_path = "/path/in/codeblock"
```

More text with `wrapped/path` and path.with.dots
"""

    should_be_found = [
        "/absolute/path",
        "~/home/path",
        "./relative/path",
        "test.txt",  # exists in tmp_path
        "subdir/file.py",  # exists in tmp_path
        "http://example.com",
        "https://example.com/path",
        "wrapped/path",
    ]
    should_be_ignored = [
        "/path/in/codeblock",  # inside a code block
        "path.with.dots",  # not a path
    ]

    # Run the detection from inside the temp directory, restoring cwd afterwards
    previous_cwd = Path.cwd()
    try:
        os.chdir(tmp_path)
        found = _find_potential_paths(message)

        for candidate in should_be_found:
            assert candidate in found
        for candidate in should_be_ignored:
            assert candidate not in found
    finally:
        os.chdir(previous_cwd)


def test_find_potential_paths_empty(tmp_path, monkeypatch):
    """Empty content and content without any path-like token yield no paths."""
    # Bare words are matched against the names in the current directory, so
    # run from an empty directory; otherwise a stray file named e.g. "text"
    # in the ambient cwd would make the second assertion fail.
    monkeypatch.chdir(tmp_path)

    # Test with empty content
    assert _find_potential_paths("") == []

    # Test with no paths
    assert _find_potential_paths("just some text") == []


def test_find_potential_paths_punctuation():
    """Trailing '!', '?', '.' and ',' are stripped from detected paths."""
    # One path per line, each followed by a different punctuation mark
    message = """
Look at ~/file.txt!
Check /path/to/file?
See ./local/path.
Visit https://example.com,
"""

    found = _find_potential_paths(message)

    for expected in (
        "~/file.txt",
        "/path/to/file",
        "./local/path",
        "https://example.com",
    ):
        assert expected in found
Loading