Skip to content

Commit

Permalink
Fix python vectorizer, prompt and other little things (#191)
Browse files Browse the repository at this point in the history
  • Loading branch information
jfrverdasca authored Jan 24, 2025
1 parent b945b1f commit 64816be
Show file tree
Hide file tree
Showing 11 changed files with 204 additions and 80 deletions.
16 changes: 12 additions & 4 deletions labs/embeddings/vectorizers/chunk_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,20 @@ def split_docs(self, docs):
def vectorize_to_database(self, include_file_extensions, repository_path, *args, **kwargs):
logger.debug("Loading and splitting all documents into chunks.")
docs = self.load_docs(repository_path, include_file_extensions)
texts = self.split_docs(docs)
files_texts = [(text.metadata["source"], text.page_content) for text in texts]
texts = [files_text[1] for files_text in files_texts]
docs_parts = self.split_docs(docs)

logger.debug("Embedding all repository documents.")
texts = []
files_texts = []
for part in docs_parts:
# Add the file path to the content to allow retrieving the file by its path
# if it is mentioned in the prompt.
file_path = part.metadata["source"]
content = f"{file_path}\n\n{part.page_content}"

files_texts.append((file_path, content))
texts.append(content)

logger.debug("Embedding all repository documents.")
embeddings = self.embedder.embed(prompt=texts)

logger.debug("Storing all embeddings.")
Expand Down
66 changes: 31 additions & 35 deletions labs/embeddings/vectorizers/python_vectorizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
import os
from pathlib import Path
from types import SimpleNamespace
from typing import Union

import pathspec
from langchain_community.document_loaders import TextLoader
Expand All @@ -14,24 +16,13 @@ class PythonVectorizer:
def __init__(self, embedder):
self.embedder = embedder

def prepare_doc_content(self, metadata, code_snippet):
metadata = SimpleNamespace(**metadata)
def load_file_chunks(self, file_path: Union[str, Path]):
try:
loader = TextLoader(file_path, encoding="utf-8")
return loader.load_and_split()

result = (
f"Source: {metadata.source}\n"
f"Name: {metadata.name}\n"
f"Start line: {metadata.start_line}\n"
f"End line: {metadata.end_line}\n"
)

if hasattr(metadata, "parameters"):
result += f"Parameters: {', '.join(metadata.parameters)}\n"

if hasattr(metadata, "returns"):
result += f"Returns: {metadata.returns}\n"

result += f"\n\n{code_snippet}"
return result
except Exception:
logger.exception(f"Error loading file {file_path}")

def load_docs(self, root_dir, file_extensions=None):
docs = []
Expand All @@ -58,20 +49,20 @@ def load_docs(self, root_dir, file_extensions=None):
if file_extensions and os.path.splitext(file_path)[1] not in file_extensions:
continue

# only python files
if os.path.splitext(file_path)[1] != ".py":
try:
loader = TextLoader(file_path, encoding="utf-8")
docs.extend(loader.load_and_split())
docs.extend(self.load_file_chunks(file_path))
continue

except Exception:
logger.exception("Failed to load repository documents into memory.")
# Python files with syntax errors will be loaded in chunks
try:
python_file_structure = parse_python_file(file_path)

except SyntaxError as e:
logger.error(f"Syntax error at {file_path}. File will be loaded without Python parsing: {e}")
docs.extend(self.load_file_chunks(file_path))
continue

python_file_structure = parse_python_file(file_path)

# functions
# Functions
for func in python_file_structure.get("functions", []):
func_ns = SimpleNamespace(**func)

Expand All @@ -85,10 +76,9 @@ def load_docs(self, root_dir, file_extensions=None):
returns=func_ns.returns,
)

doc_content = self.prepare_doc_content(metadata, function_snippet)
docs.append(Document(doc_content, metadata=metadata))
docs.append(Document(function_snippet, metadata=metadata))

# classes
# Classes
for cls in python_file_structure.get("classes", []):
cls_ns = SimpleNamespace(**cls)

Expand All @@ -100,8 +90,7 @@ def load_docs(self, root_dir, file_extensions=None):
end_line=cls_ns.end_line,
)

doc_content = self.prepare_doc_content(metadata, class_snippet)
docs.append(Document(doc_content, metadata=metadata))
docs.append(Document(class_snippet, metadata=metadata))

for method in cls.get("methods"):
method_ns = SimpleNamespace(**method)
Expand All @@ -116,17 +105,24 @@ def load_docs(self, root_dir, file_extensions=None):
returns=method_ns.returns,
)

doc_content = self.prepare_doc_content(metadata, method_snippet)
docs.append(Document(doc_content, metadata=metadata))
docs.append(Document(method_snippet, metadata=metadata))

return docs

def vectorize_to_database(self, include_file_extensions, repository_path, *args, **kwargs):
docs = self.load_docs(repository_path, include_file_extensions)

logger.debug(f"Loading {len(docs)} documents...")
files_texts = [(doc.metadata["source"], doc.page_content) for doc in docs]
texts = [file_and_text[1] for file_and_text in files_texts]
texts = []
files_texts = []
for doc in docs:
# Add the file path to the content to allow retrieving the file by its path
# if it is mentioned in the prompt.
file_path = doc.metadata["source"]
content = f"{file_path}\n\n{doc.page_content}"

files_texts.append((file_path, content))
texts.append(content)

embeddings = self.embedder.embed(prompt=texts)

Expand Down
7 changes: 4 additions & 3 deletions labs/file_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ def modify_file_line(file_path: str, content: Union[str | List[str]], line_numbe

temp_file.write(line)

if line_number > current_line_number:
temp_file.writelines(content)
if not overwrite and line_number > current_line_number:
# If we reach this condition line_number + 1 does not apply, so we add a `\n` before
temp_file.writelines(f"\n{content}")

except Exception as e:
logger.error(f"Error modifying file {file_path}: {e}")
Expand Down Expand Up @@ -85,7 +86,7 @@ def get_file_content(file_path: str) -> str:
content = ""
with open(file_path, "r") as file:
for line_number, line in enumerate(file, start=1):
content += f"{line_number}: {line}"
content += f"{str(line_number).rjust(6)} | {line}"

return content

Expand Down
56 changes: 39 additions & 17 deletions labs/llm/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,46 @@
"text/x-python": "python",
}

CONTENT_TEMPLATE = "The following is the code in `{file}`:\n\n````{mimetype}\n{content}\n```"
CONTENT_TEMPLATE = "The following is the code in `{file}`:\n\n```{mimetype}\n{content}\n```"
PERSONA_CONTEXT = """
You are an advanced software engineer assistant designed to resolve code-based tasks.
You will receive:
1. A description of the task.
2. File names and their contents as context.
3. Constraints such as not modifying migrations unless explicitly required.
You should:
- Analyze the provided task description and associated context.
- Generate the necessary code changes to resolve the task.
- Ensure adherence to best practices for the programming language used.
- Avoid changes to migrations or unrelated files unless specified.
- Provide clean, organized, and ready-to-review code changes.
- Group related logic together to ensure clarity and cohesion.
- Add meaningful comments to explain non-obvious logic or complex operations.
- Ensure the code integrates seamlessly into the existing structure of the project.
- Perform the 'delete' operations in **reverse line number order** to avoid line shifting.
# Overview
This is a Python repository for Revent, a photo contest API built with Django.
# Guidance
You are an advanced software engineering assistant tasked with resolving code-related tasks. You will receive:
- Task descriptions.
- File names with line numbers and content as context.
- Constraints such as not modifying migrations unless explicitly required.
Your environment is fully set up—no need to install packages.
# Task Guidelines
- Solve problems with minimal, clean, and efficient code.
- Avoid complexity: minimize branching logic, error handling, and unnecessary lines.
# Code Style
- Use Python's standard library/core packages when possible.
- Prioritize readability, maintainability, and computational efficiency.
- Avoid excessive loops, chains, and nested logic.
- Use:
- Explicit imports (no `import *`).
- List comprehensions over loops (but keep them simple).
- f-strings for formatting.
- Type hints for all function signatures.
- Dataclasses for simple classes.
- Avoid:
- Excessive try/except blocks or nested error handling.
- Installing new packages or using make commands unless specified.
# Testing
- Place tests in the `tests` directory.
- Use the `unittest` framework.
- Write unit tests for all functions, covering edge cases and error conditions.
- Aim for 100% test coverage with concise, comprehensive tests.
# Resources
- README.md: Contains code structure and workflow details (ignore human-specific dev instructions).
- Makefile: Useful commands (refer to instructions in this file).
"""


Expand Down
4 changes: 2 additions & 2 deletions labs/llm/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@
{
"steps": [
{
"type": "Operation type: 'create', 'update', 'overwrite', or 'delete'",
"type": "Operation type: 'create', 'insert', 'overwrite', or 'delete'",
"path": "Absolute file path",
"content": "Content to write (required for 'create', 'update', or 'overwrite')",
"content": "Content to write (required for 'create', 'insert', or 'overwrite')",
"line": "Initial line number where the content should be written (or erased if 'delete')",
}
]
Expand Down
9 changes: 2 additions & 7 deletions labs/parsers/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,14 +209,9 @@ def parse_python_file(file_path: str) -> str | dict:
file_content = source.read()

parser = PythonFileParser(file_name=file_path)
try:
tree = ast.parse(file_content, file_path)

except SyntaxError as e:
print(f"Syntax error at {file_path}: {e}")
return dict()

tree = ast.parse(file_content, file_path)
parser.visit(tree)

return parser.get_structure()


Expand Down
5 changes: 2 additions & 3 deletions labs/tasks/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import logging
from typing import List, Optional

from config.celery import app
from config.redis_client import RedisVariable, redis_client
from core.models import Model, VectorizerModel
from django.conf import settings
from embeddings.embedder import Embedder
Expand All @@ -11,9 +13,6 @@
from llm.prompt import get_prompt
from llm.requester import Requester

from config.celery import app
from config.redis_client import RedisVariable, redis_client

logger = logging.getLogger(__name__)


Expand Down
3 changes: 1 addition & 2 deletions labs/tasks/logging.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from core.models import Model, WorkflowResult

from config.celery import app
from config.redis_client import RedisVariable, redis_client
from core.models import Model, WorkflowResult


@app.task
Expand Down
10 changes: 6 additions & 4 deletions labs/tasks/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
import logging
from typing import List, cast

from config.celery import app
from config.redis_client import RedisVariable, redis_client
from decorators import time_and_log_function
from file_handler import create_file, delete_file_line, modify_file_line
from github.github import GithubRequests
from parsers.response import parse_llm_output

from config.celery import app
from config.redis_client import RedisVariable, redis_client

logger = logging.getLogger(__name__)


Expand All @@ -26,8 +25,11 @@ def github_repository_data(prefix, token="", repository_owner="", repository_nam
def apply_code_changes(llm_response):
response = parse_llm_output(llm_response)

# We will sort the operations by file and by line number
sorted_steps = sorted(response.steps, key=lambda s: (s.path, s.line), reverse=True)

files: List[str | None] = []
for step in response.steps:
for step in sorted_steps:
match step.type:
case "create":
create_file(step.path, step.content)
Expand Down
5 changes: 2 additions & 3 deletions labs/tasks/run.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os.path

from celery import chain
from config.celery import app
from config.redis_client import RedisVariable, redis_client
from tasks import (
apply_code_changes_task,
clone_repository_task,
Expand All @@ -15,9 +17,6 @@
vectorize_repository_task,
)

from config.celery import app
from config.redis_client import RedisVariable, redis_client


@app.task(bind=True)
def init_task(self, **kwargs):
Expand Down
Loading

0 comments on commit 64816be

Please sign in to comment.