Study and discoveries of embeddings and tools #118

Merged
merged 30 commits into from
Oct 24, 2024
Changes from 28 commits
Commits (30)
f8ead40
feat: Add embeddins and tools study code and concusions
jfrverdasca Oct 7, 2024
30296fd
feat: Add embeddins and tools study code and conclusions
jfrverdasca Oct 7, 2024
ef10298
feat: Add tools discussion
jfrverdasca Oct 8, 2024
287f53e
feat: Add "Embeddings across different projects" discussion and other…
jfrverdasca Oct 8, 2024
aec188b
feat: Added code to "Combine embeddings and tools"
jfrverdasca Oct 9, 2024
d70afe6
feat: Litle text fix
jfrverdasca Oct 9, 2024
1196e7c
feat: Litle text fix
jfrverdasca Oct 9, 2024
f0ee721
feat: Added the details that were as TODO
jfrverdasca Oct 15, 2024
780b21a
Merge branch 'refs/heads/main' into 113/embeddings
jfrverdasca Oct 15, 2024
de8a1ac
feat: Added jupyter with tests and new embeddins method
jfrverdasca Oct 16, 2024
46d8398
feat: Add Python code embeddins
jfrverdasca Oct 18, 2024
5fc5b3a
Merge branch 'refs/heads/main' into 113/embeddings
jfrverdasca Oct 18, 2024
9ef349b
feat: Add Python code embeddins
jfrverdasca Oct 21, 2024
5fe17cf
fix: Embeddings jupyter reorganization
jfrverdasca Oct 21, 2024
aa0a2a9
feat: Add python parser tests
jfrverdasca Oct 21, 2024
54f42f2
fix: Test fix, Readme update, Jupyter Notebook update
jfrverdasca Oct 22, 2024
e14034b
Merge branch 'refs/heads/main' into 113/embeddings
jfrverdasca Oct 22, 2024
0a6d9cf
fix: Poetry lock update
jfrverdasca Oct 22, 2024
cf33891
fix: Local LLM tests
jfrverdasca Oct 22, 2024
7bce02b
chore: git pre-commits linters
jfrverdasca Oct 22, 2024
d354992
chore: ruff errors fix
jfrverdasca Oct 22, 2024
eb5b552
Fix README.md embedddins jupyter notebook path
jfrverdasca Oct 22, 2024
ada8c68
Fix deleted accidentally make file zsh command
jfrverdasca Oct 22, 2024
97e41cb
Fix removed the unnecessary embeddins factory file
jfrverdasca Oct 23, 2024
76a6fd8
Add details and documentation of tests performed to embeddins
jfrverdasca Oct 23, 2024
5e1c2f1
Fix text errors
jfrverdasca Oct 23, 2024
c3234d5
Fix text errors
jfrverdasca Oct 23, 2024
68002bf
Merge branch 'main' into 113/embeddings
kenvontucky Oct 24, 2024
77cf5f0
Add input prompt to LLM prompt
jfrverdasca Oct 24, 2024
6fca318
Merge remote-tracking branch 'origin/113/embeddings' into 113/embeddings
jfrverdasca Oct 24, 2024
2 changes: 1 addition & 1 deletion .env.sample
Original file line number Diff line number Diff line change
@@ -37,4 +37,4 @@ REDIS_HOST=redis
REDIS_PORT=6379

LOCAL_LLM=False
LOCAL_LLM_HOST=http://localhost:11434
LOCAL_LLM_HOST=http://localhost:11434
6 changes: 6 additions & 0 deletions README.md
@@ -38,3 +38,9 @@ Here's the steps to setup the project locally:
3. `make up`
4. `make api` or `ENV=local make api`
5. `ENV=test make tests`


## Approaches, Tests Conducted and Results

For details on the approaches taken and the tests conducted on issues encountered during development,
refer to [doc/results.md](./doc/results.md).
4 changes: 1 addition & 3 deletions doc/diagram/class_diagram_script.py
@@ -28,9 +28,7 @@ def extract_classes(file_path):
args.append(f"{arg.arg}: {arg.annotation.id}")
elif isinstance(arg.annotation, ast.Subscript):
if isinstance(arg.annotation.value, ast.Name):
args.append(
f"{arg.arg}: {arg.annotation.value.id}[{arg.annotation.slice.value.id}]"
)
args.append(f"{arg.arg}: {arg.annotation.value.id}[{arg.annotation.slice.value.id}]")
else:
args.append(arg.arg)
method_signature = f"{visibility} {method_name}({', '.join(args)})"
59 changes: 59 additions & 0 deletions doc/results.md
@@ -0,0 +1,59 @@
# Project test results

This document records the approaches tested, the discoveries made along the way on various problems,
and the results of the tests conducted.


## Embeddings

During the course of the project, various approaches were taken regarding how embeddings are generated, namely:

* Splitting the files into chunks of a predefined size (**File chunks**);
* Splitting the files according to the structure of the Python code they contain (**Python code structure**).

For more details on the implementation and the test results, refer to the Jupyter Notebook [here](../notebooks/embeddings.ipynb).
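The second approach can be illustrated with a stdlib-only sketch that extracts the line spans of top-level functions with `ast`; this is an invented helper for intuition only, not the project's actual `parse_python_file`:

```python
import ast
import textwrap

def function_spans(source: str):
    """Return (name, start_line, end_line) for each top-level function."""
    tree = ast.parse(source)
    spans = []
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            # end_lineno is available on AST nodes since Python 3.8
            spans.append((node.name, node.lineno, node.end_lineno))
    return spans

code = textwrap.dedent("""\
    def add(a, b):
        return a + b

    def sub(a, b):
        return a - b
""")
print(function_spans(code))  # → [('add', 1, 2), ('sub', 4, 5)]
```

Each span can then be embedded as its own document, instead of an arbitrary fixed-size slice that may cut a function in half.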

### Test results

The following table concisely presents the results obtained, where:
* **Embeddings model**: indicates the model used to generate the embeddings;
* **LLM**: indicates the LLM used, which can be either a local LLM or an external API (like OpenAI);
* **Embeddings creation method**: indicates the method used to create the embeddings, as previously described;
* **Response quality**: indicates the quality of the LLM's response, which can be:
* **Very poor**: The code contains critical issues such as syntax errors, missing imports or broken logic
that prevent it from running;
* **Poor**: The code runs but fails to achieve the intended result. It contains logical errors or incorrect
implementations of functions, and often throws runtime errors;
* **Average**: The code is mostly functional, but there are some errors or edge cases where it fails.
It produces the desired result in many scenarios but may have inefficiencies, unhandled exceptions,
or inconsistent behavior in certain conditions;
* **Good**: The code works as expected in the majority of cases and handles most inputs correctly.
It follows good practices and has few, if any, logical errors but might lack optimization or robustness in edge cases;
* **Excellent**: The code is flawless and functions exactly as expected across all scenarios.
It is well-structured, optimized and follows Python best practices;
* **Prompt**: indicates the prompt used in the LLM.


| Embedding model | LLM (Local/API) | Embeddings creation method | Response quality | Prompt |
|-----------------------------------------------------------------------------------------------|-----------------|---------------------------|------------------|----------|
| [text-embedding-ada-002](https://platform.openai.com/docs/guides/embeddings/embedding-models) | OpenAI API | File chunks | Poor | Prompt 1 |
| [text-embedding-ada-002](https://platform.openai.com/docs/guides/embeddings/embedding-models) | OpenAI API | Python code structure | Poor | Prompt 1 |


Below is the prompt used in the LLM for the test conducted:

#### Prompt 1:
> You're a diligent software engineer AI. You can't see, draw, or interact with a
> browser, but you can read and write files, and you can think.
> You've been given the following task: {issue_summary}.
> Any imports will be at the beginning of the file.
> Add tests for the new functionalities, considering any existing test files.
> The file paths provided are **absolute paths relative to the project root**,
> and **must not be changed**. Ensure the paths you output match the paths provided exactly.
> Do not prepend or modify the paths.
> Please provide a json response in the following format: {{"steps": [...]}}
> Where steps is a list of objects where each object contains three fields:
> type, which is either 'create' to add a new file or 'modify' to edit an existing one;
> If the file is to be modified send the finished version of the entire file.
> path, which is the absolute path of the file to create/modify;
> content, which is the content to write to the file.
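A response following the format Prompt 1 requests can be checked with a short stdlib snippet; the step values below are made up for illustration:

```python
import json

# A hypothetical response in the {"steps": [...]} shape the prompt asks for.
raw = """
{"steps": [
  {"type": "create", "path": "/labs/new_module.py", "content": "print('hello')"},
  {"type": "modify", "path": "/README.md", "content": "# Updated"}
]}
"""

response = json.loads(raw)
for step in response["steps"]:
    # Each step must carry exactly the three fields the prompt describes.
    assert step["type"] in ("create", "modify")
    assert step["path"].startswith("/")
    assert "content" in step

print([s["type"] for s in response["steps"]])  # → ['create', 'modify']
```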
83 changes: 0 additions & 83 deletions labs/database/vectorize.py

This file was deleted.

6 changes: 6 additions & 0 deletions labs/database/vectorize/__init__.py
@@ -0,0 +1,6 @@
from abc import ABC, abstractmethod


class Vectorizer(ABC):
@abstractmethod
def vectorize_to_database(self, include_file_extensions, repo_destination): ...
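Any new embedding strategy just subclasses this interface; a minimal sketch (the `NullVectorizer` name is invented purely to show the contract):

```python
from abc import ABC, abstractmethod

class Vectorizer(ABC):
    @abstractmethod
    def vectorize_to_database(self, include_file_extensions, repo_destination): ...

# Hypothetical strategy used only to demonstrate the interface.
class NullVectorizer(Vectorizer):
    def vectorize_to_database(self, include_file_extensions, repo_destination):
        return []  # embed nothing

print(NullVectorizer().vectorize_to_database([".py"], "/tmp/repo"))  # → []
```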
82 changes: 82 additions & 0 deletions labs/database/vectorize/chunk_vectorizer.py
@@ -0,0 +1,82 @@
from litellm import embedding
import openai
import os
import pathspec
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

from labs.database.vectorize import Vectorizer

import logging

from labs.config import settings
from labs.database.embeddings import reembed_code


logger = logging.getLogger(__name__)

openai.api_key = settings.OPENAI_API_KEY


class ChunkVectorizer(Vectorizer):
def load_docs(self, root_dir, file_extensions=None):
"""
Load documents from the specified root directory.
Ignore dotfiles, dot directories, and files that match .gitignore rules.
Optionally filter by file extensions.
"""
docs = []

# Load .gitignore rules
gitignore_path = os.path.join(root_dir, ".gitignore")

if os.path.isfile(gitignore_path):
with open(gitignore_path, "r") as gitignore_file:
gitignore = gitignore_file.read()
spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, gitignore.splitlines())
else:
spec = None

for dirpath, dirnames, filenames in os.walk(root_dir):
# Remove dot directories from the list of directory names
dirnames[:] = [d for d in dirnames if not d.startswith(".")]

for file in filenames:
file_path = os.path.join(dirpath, file)

if file.startswith("."):
continue
if file.endswith(".lock"):
continue

# Skip files that match .gitignore rules
if spec and spec.match_file(file_path):
continue

if file_extensions and os.path.splitext(file)[1] not in file_extensions:
continue

try:
loader = TextLoader(file_path, encoding="utf-8")
docs.extend(loader.load_and_split())
except Exception:
logger.exception("Failed to load repo documents into memory.")
return docs

def split_docs(self, docs):
"""Split the input documents into smaller chunks."""
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
return text_splitter.split_documents(docs)

def vectorize_to_database(self, include_file_extensions, repo_destination):
logger.debug("Loading and splitting all documents into chunks.")
docs = self.load_docs(repo_destination, include_file_extensions)
texts = self.split_docs(docs)
files_and_texts = [(text.metadata["source"], text.page_content) for text in texts]
texts = [file_and_text[1] for file_and_text in files_and_texts]

logger.debug("Embedding all repo documents.")
embeddings = embedding(model="text-embedding-ada-002", input=texts)

logger.debug("Storing all embeddings.")
reembed_code(files_and_texts, embeddings)
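For intuition, the fixed-size chunking that `CharacterTextSplitter` performs above can be approximated with a stdlib-only sketch; this is a simplification, not LangChain's actual splitting logic:

```python
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 0):
    """Split text into fixed-size chunks, with optional overlap between them."""
    step = chunk_size - overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]

doc = "x" * 2500
chunks = chunk_text(doc, chunk_size=1000)
print([len(c) for c in chunks])  # → [1000, 1000, 500]
```

Note how a function body spanning a chunk boundary would be split mid-definition, which motivates the structure-aware vectorizer below.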
137 changes: 137 additions & 0 deletions labs/database/vectorize/python_vectorizer.py
@@ -0,0 +1,137 @@
from types import SimpleNamespace

from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from litellm import embedding
import openai
import os
import pathspec

import logging

from labs.config import settings
from labs.database.embeddings import reembed_code

from labs.parsers.python import get_lines_code, parse_python_file
from labs.database.vectorize import Vectorizer

logger = logging.getLogger(__name__)

openai.api_key = settings.OPENAI_API_KEY


class PythonVectorizer(Vectorizer):
def prepare_doc_content(self, metadata, code_snippet):
metadata = SimpleNamespace(**metadata)

result = (
f"Source: {metadata.source}\n"
f"Name: {metadata.name}\n"
f"Start line: {metadata.start_line}\n"
f"End line: {metadata.end_line}\n"
)

if hasattr(metadata, "parameters"):
result += f"Parameters: {', '.join(metadata.parameters)}\n"

if hasattr(metadata, "returns"):
result += f"Returns: {metadata.returns}\n"

result += f"\n\n{code_snippet}"
return result

def load_docs(self, root_dir, file_extensions=None):
docs = []

gitignore_path = os.path.join(root_dir, ".gitignore")
if os.path.isfile(gitignore_path):
with open(gitignore_path, "r") as gitignore_file:
gitignore = gitignore_file.read()
spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, gitignore.splitlines())

else:
spec = None

for dirpath, dirnames, filenames in os.walk(root_dir):
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
for file in filenames:
file_path = os.path.join(dirpath, file)
if file.startswith(".") or file.endswith(".lock"):
continue

if spec and spec.match_file(file_path):
continue

if file_extensions and os.path.splitext(file_path)[1] not in file_extensions:
continue

# only python files
if os.path.splitext(file_path)[1] != ".py":
try:
loader = TextLoader(file_path, encoding="utf-8")
docs.extend(loader.load_and_split())

except Exception:
logger.exception("Failed to load repo documents into memory.")

continue

python_file_structure = parse_python_file(file_path)

# functions
for func in python_file_structure.get("functions", []):
func_ns = SimpleNamespace(**func)

function_snippet = get_lines_code(file_path, func_ns.start_line, func_ns.end_line)
metadata = dict(
source=file_path,
name=func_ns.name,
start_line=func_ns.start_line,
end_line=func_ns.end_line,
parameters=func_ns.parameters,
returns=func_ns.returns,
)

doc_content = self.prepare_doc_content(metadata, function_snippet)
docs.append(Document(doc_content, metadata=metadata))

# classes
for cls in python_file_structure.get("classes", []):
cls_ns = SimpleNamespace(**cls)

class_snippet = get_lines_code(file_path, cls_ns.start_line, cls_ns.end_line)
metadata = dict(
source=file_path, name=cls_ns.name, start_line=cls_ns.start_line, end_line=cls_ns.end_line
)

doc_content = self.prepare_doc_content(metadata, class_snippet)
docs.append(Document(doc_content, metadata=metadata))

for method in cls.get("methods", []):
method_ns = SimpleNamespace(**method)

method_snippet = get_lines_code(file_path, method_ns.start_line, method_ns.end_line)
metadata = dict(
source=file_path,
name=method_ns.name,
start_line=method_ns.start_line,
end_line=method_ns.end_line,
parameters=method_ns.parameters,
returns=method_ns.returns,
)

doc_content = self.prepare_doc_content(metadata, method_snippet)
docs.append(Document(doc_content, metadata=metadata))

return docs

def vectorize_to_database(self, include_file_extensions, repo_destination):
docs = self.load_docs(repo_destination, include_file_extensions)

logger.debug(f"Embedding {len(docs)} documents...")

for doc in docs:
embeddings = embedding(model="text-embedding-ada-002", input=[doc.page_content])

logger.debug("Storing embeddings...")
reembed_code([(doc.metadata["source"], doc.page_content)], embeddings)
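The metadata header that `prepare_doc_content` builds can be exercised standalone; the sample metadata values below are invented:

```python
from types import SimpleNamespace

def prepare_doc_content(metadata: dict, code_snippet: str) -> str:
    # Mirrors the method above: a metadata header followed by the code snippet.
    ns = SimpleNamespace(**metadata)
    result = (
        f"Source: {ns.source}\n"
        f"Name: {ns.name}\n"
        f"Start line: {ns.start_line}\n"
        f"End line: {ns.end_line}\n"
    )
    if hasattr(ns, "parameters"):
        result += f"Parameters: {', '.join(ns.parameters)}\n"
    if hasattr(ns, "returns"):
        result += f"Returns: {ns.returns}\n"
    return result + f"\n\n{code_snippet}"

meta = {"source": "labs/example.py", "name": "add", "start_line": 1, "end_line": 2,
        "parameters": ["a", "b"], "returns": "int"}
print(prepare_doc_content(meta, "def add(a, b):\n    return a + b"))
```

Embedding this combined header-plus-snippet text gives the retriever the file path and line range alongside the code itself.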