From 2a7c6339d602cd5afef7521272c98380c06a6ba9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Verdasca?= Date: Tue, 10 Dec 2024 15:03:31 +0000 Subject: [PATCH 1/3] Fix: Refactored configuration_variables.py and logger.py to Django logging --- Makefile | 2 +- labs/api/schemas/codemonkey.py | 5 ++- labs/config/celery.py | 2 +- labs/config/configuration_variables.py | 25 ----------- labs/config/logger.py | 19 ++++++++ labs/config/settings.py | 61 ++++++++++++++++++++++++++ labs/embeddings/embedder.py | 9 +++- labs/embeddings/ollama.py | 4 +- labs/github/github.py | 2 +- labs/llm/ollama.py | 4 +- labs/logger.py | 49 --------------------- labs/tasks/llm.py | 22 ++++++++-- labs/tasks/repository.py | 2 +- labs/tasks/run.py | 2 +- 14 files changed, 118 insertions(+), 90 deletions(-) delete mode 100644 labs/config/configuration_variables.py create mode 100644 labs/config/logger.py delete mode 100644 labs/logger.py diff --git a/Makefile b/Makefile index 9833da9..9803a50 100644 --- a/Makefile +++ b/Makefile @@ -93,7 +93,7 @@ migrate: poetry run python labs/manage.py migrate createuser: - DJANGO_SUPERUSER_PASSWORD=admin poetry run python labs/manage.py createsuperuser --noinput --username=admin --email=a@b.com + DJANGO_SUPERUSER_PASSWORD=admin poetry run python labs/manage.py createsuperuser --noinput --username=admin --email=admin@example.com load_fixtures: python labs/manage.py loaddata $(wildcard labs/fixtures/*.json) diff --git a/labs/api/schemas/codemonkey.py b/labs/api/schemas/codemonkey.py index 2b6a3e9..057908a 100644 --- a/labs/api/schemas/codemonkey.py +++ b/labs/api/schemas/codemonkey.py @@ -1,6 +1,7 @@ from typing import List, Optional from api.schemas.github import GithubSchema +from django.conf import settings from pydantic import BaseModel @@ -18,7 +19,9 @@ class VectorizeRepositorySchema(BaseModel): repository_path: str -class FindEmbeddingsSchema(LocalRepositoryShema): ... 
+class FindEmbeddingsSchema(LocalRepositoryShema): + similarity_threshold: float = settings.EMBEDDINGS_SIMILARITY_TRESHOLD + max_results: int = settings.EMBEDDINGS_MAX_RESULTS class PreparePromptContextSchema(BaseModel): diff --git a/labs/config/celery.py b/labs/config/celery.py index 9a201c7..f88d84b 100644 --- a/labs/config/celery.py +++ b/labs/config/celery.py @@ -1,10 +1,10 @@ import logging import os -import config.configuration_variables as settings import redis from celery import Celery from celery.signals import task_failure +from django.conf import settings from kombu import Queue from redbeat import RedBeatSchedulerEntry, schedulers diff --git a/labs/config/configuration_variables.py b/labs/config/configuration_variables.py deleted file mode 100644 index 36aeccd..0000000 --- a/labs/config/configuration_variables.py +++ /dev/null @@ -1,25 +0,0 @@ -import os - -from logger import setup_logger - -setup_logger() - - -GITHUB_API_BASE_URL = "https://api.github.com" - -CLONE_DESTINATION_DIR = os.getenv("CLONE_DESTINATION_DIR", "/tmp/") - -DATABASE_USER = os.environ.get("DATABASE_USER", "postgres") -DATABASE_PASS = os.environ.get("DATABASE_PASS", "postgres") -DATABASE_HOST = os.environ.get("DATABASE_HOST", "localhost") -DATABASE_PORT = os.environ.get("DATABASE_PORT", "5432") -DATABASE_NAME = os.environ.get("DATABASE_NAME", "postgres") -DATABASE_URL = f"postgresql://{DATABASE_USER}:{DATABASE_PASS}@{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_NAME}" - -CELERY_BROKER_URL = os.environ.get("CELERY_BROKER_URL") -CELERY_BACKEND_URL = os.environ.get("CELERY_BACKEND_URL") - -REDIS_HOST = os.environ.get("REDIS_HOST") -REDIS_PORT = os.environ.get("REDIS_PORT") - -LOCAL_LLM_HOST = os.environ.get("LOCAL_LLM_HOST", "http://ollama:11434") diff --git a/labs/config/logger.py b/labs/config/logger.py new file mode 100644 index 0000000..b28cef4 --- /dev/null +++ b/labs/config/logger.py @@ -0,0 +1,19 @@ +from datetime import datetime + +from django.conf import settings +from 
pythonjsonlogger import jsonlogger + + +class CustomJsonFormatter(jsonlogger.JsonFormatter): + def add_fields(self, log_record, record, message_dict): + super(CustomJsonFormatter, self).add_fields(log_record, record, message_dict) + + if not log_record.get("timestamp"): + log_record["timestamp"] = datetime.now().strftime(settings.LOGGING_DATETIME_FORMAT) + + if log_record.get("level"): + log_record["level"] = log_record["level"].upper() + else: + log_record["level"] = record.levelname + + log_record["project"] = "codemonkey" diff --git a/labs/config/settings.py b/labs/config/settings.py index 8f73602..346c5f2 100644 --- a/labs/config/settings.py +++ b/labs/config/settings.py @@ -87,6 +87,50 @@ } +# Logging +def create_logging_directory() -> Path: + logs_path = BASE_DIR.parent / "logs" + if not os.path.exists(logs_path): + os.makedirs(logs_path) + + return logs_path / "debug.log" + + +LOGGING_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S,%f" +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": { + "format": "[%(asctime)s][%(levelname)s][%(name)s]: %(message)s", + "datefmt": LOGGING_DATETIME_FORMAT, + }, + "json": {"()": "config.logger.CustomJsonFormatter"}, + }, + "handlers": { + "console": { + "level": "DEBUG", + "class": "logging.StreamHandler", + "formatter": "standard", + }, + "file": { + "level": "DEBUG", + "class": "logging.handlers.RotatingFileHandler", + "filename": create_logging_directory(), + "maxBytes": 10000000, + "backupCount": 5, + "formatter": "json", + }, + }, + "loggers": { + "labs": { + "level": "DEBUG", + "handlers": ["console", "file"], + "propagate": False, + } + }, +} + # Password validation # https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators @@ -127,3 +171,20 @@ # https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" + +# Celery settings +CELERY_BROKER_URL = os.environ.get("CELERY_BROKER_URL") 
+CELERY_BACKEND_URL = os.environ.get("CELERY_BACKEND_URL") + +# Redis settings +REDIS_HOST = os.environ.get("REDIS_HOST") +REDIS_PORT = os.environ.get("REDIS_PORT") + +# Custom settings +GITHUB_API_BASE_URL = "https://api.github.com" + +LOCAL_LLM_HOST = os.environ.get("LOCAL_LLM_HOST", "http://ollama:11434") + +CLONE_DESTINATION_DIR = os.getenv("CLONE_DESTINATION_DIR", "/tmp/") +EMBEDDINGS_SIMILARITY_TRESHOLD = 0.7 +EMBEDDINGS_MAX_RESULTS = 10 diff --git a/labs/embeddings/embedder.py b/labs/embeddings/embedder.py index 08a664c..533afe6 100644 --- a/labs/embeddings/embedder.py +++ b/labs/embeddings/embedder.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union +from django.conf import settings from embeddings.models import Embedding from pgvector.django import CosineDistance @@ -20,7 +21,11 @@ def embed(self, prompt, *args, **kwargs) -> Embeddings: return self.embedder.embed(prompt, *args, **kwargs) def retrieve_embeddings( - self, query: str, repository: str, similarity_threshold: int = 0.7, number_of_results: int = 10 + self, + query: str, + repository: str, + similarity_threshold: float = settings.EMBEDDINGS_SIMILARITY_TRESHOLD, + max_results: int = settings.EMBEDDINGS_MAX_RESULTS, ) -> List[Embedding]: query = query.replace("\n", "") embedded_query = self.embed(prompt=query).embeddings @@ -29,7 +34,7 @@ def retrieve_embeddings( return Embedding.objects.annotate(distance=CosineDistance("embedding", embedded_query[0])).filter( repository=repository, distance__lt=similarity_threshold - )[:number_of_results] + )[:max_results] def reembed_code( self, diff --git a/labs/embeddings/ollama.py b/labs/embeddings/ollama.py index caee400..bc5f6ff 100644 --- a/labs/embeddings/ollama.py +++ b/labs/embeddings/ollama.py @@ -1,4 +1,4 @@ -from config.configuration_variables import LOCAL_LLM_HOST +from django.conf import settings from embeddings.embedder import Embeddings from ollama import Client @@ -7,7 +7,7 @@ class 
OllamaEmbedder: def __init__(self, model): self._model_name = model - self._client = Client(LOCAL_LLM_HOST) + self._client = Client(settings.LOCAL_LLM_HOST) def embed(self, prompt, *args, **kwargs) -> Embeddings: result = self._client.embed(self._model_name, prompt, *args, **kwargs) diff --git a/labs/github/github.py b/labs/github/github.py index 98f3f6c..2521bb1 100644 --- a/labs/github/github.py +++ b/labs/github/github.py @@ -3,9 +3,9 @@ import os from dataclasses import dataclass -import config.configuration_variables as settings import git import requests +from django.conf import settings logger = logging.getLogger(__name__) diff --git a/labs/llm/ollama.py b/labs/llm/ollama.py index 2afcf99..0d8df62 100644 --- a/labs/llm/ollama.py +++ b/labs/llm/ollama.py @@ -1,4 +1,4 @@ -from config.configuration_variables import LOCAL_LLM_HOST +from django.conf import settings from ollama import Client @@ -6,7 +6,7 @@ class OllamaRequester: def __init__(self, model): self._model_name = model - self._client = Client(LOCAL_LLM_HOST) + self._client = Client(settings.LOCAL_LLM_HOST) def completion_without_proxy(self, messages, *args, **kwargs): """ diff --git a/labs/logger.py b/labs/logger.py deleted file mode 100644 index 476aabe..0000000 --- a/labs/logger.py +++ /dev/null @@ -1,49 +0,0 @@ -import logging -from datetime import datetime -from logging.handlers import RotatingFileHandler - -from pythonjsonlogger import jsonlogger - -DEFAULT_MAX_BYTES = 10000000 -DEFAULT_BACKUP_COUNT = 5 -LOG_FORMAT = "%Y-%m-%d %H:%M:%S,%f" - - -class CustomJsonFormatter(jsonlogger.JsonFormatter): - def add_fields(self, log_record, record, message_dict): - super(CustomJsonFormatter, self).add_fields(log_record, record, message_dict) - - if not log_record.get("timestamp"): - log_record["timestamp"] = datetime.now().strftime(LOG_FORMAT) - - if log_record.get("level"): - log_record["level"] = log_record["level"].upper() - else: - log_record["level"] = record.levelname - - log_record["project"] = 
"codemonkey" - - -def setup_logger(): - logging.basicConfig(level=logging.DEBUG, datefmt=LOG_FORMAT) - logger = logging.getLogger("labs") - logger.propagate = False - - log_format = "[%(asctime)s][%(levelname)s][%(name)s]: %(message)s" - formatter = logging.Formatter(fmt=log_format, datefmt=LOG_FORMAT) - stream_handler = logging.StreamHandler() - stream_handler.setFormatter(formatter) - logger.addHandler(stream_handler) - - try: - formatter = CustomJsonFormatter() - handler = RotatingFileHandler( - "logs/debug.log", - maxBytes=DEFAULT_MAX_BYTES, - backupCount=DEFAULT_BACKUP_COUNT, - ) - handler.setFormatter(formatter) - logger.addHandler(handler) - - except Exception: - pass diff --git a/labs/tasks/llm.py b/labs/tasks/llm.py index d075e71..e1b4b50 100644 --- a/labs/tasks/llm.py +++ b/labs/tasks/llm.py @@ -1,9 +1,9 @@ import json import logging -import config.configuration_variables as settings from config.celery import app from core.models import Model, VectorizerModel +from django.conf import settings from embeddings.embedder import Embedder from embeddings.vectorizers.vectorizer import Vectorizer from llm.requester import Requester @@ -85,11 +85,19 @@ def vectorize_repository_task(prefix="", repository_path=""): @app.task -def find_embeddings_task(prefix="", issue_body="", repository_path=""): +def find_embeddings_task( + prefix="", + issue_body="", + repository_path="", + similarity_threshold=settings.EMBEDDINGS_SIMILARITY_TRESHOLD, + max_results=settings.EMBEDDINGS_MAX_RESULTS, +): embedder_class, *embeder_args = Model.get_active_embedding_model() embeddings_results = Embedder(embedder_class, *embeder_args).retrieve_embeddings( redis_client.get(RedisVariable.ISSUE_BODY, prefix=prefix, default=issue_body), redis_client.get(RedisVariable.REPOSITORY_PATH, prefix=prefix, default=repository_path), + similarity_threshold, + max_results, ) similar_embeddings = [ (embedding.repository, embedding.file_path, embedding.text) for embedding in embeddings_results @@ -102,7 
+110,10 @@ def find_embeddings_task(prefix="", issue_body="", repository_path=""): @app.task -def prepare_prompt_and_context_task(prefix="", issue_body="", embeddings=[]): +def prepare_prompt_and_context_task(prefix="", issue_body="", embeddings=None): + if not embeddings: + embeddings = [] + prompt = get_prompt(redis_client.get(RedisVariable.ISSUE_BODY, prefix=prefix, default=issue_body)) redis_client.set(RedisVariable.PROMPT, prefix=prefix, value=prompt) @@ -116,7 +127,10 @@ def prepare_prompt_and_context_task(prefix="", issue_body="", embeddings=[]): @app.task -def get_llm_response_task(prefix="", context={}): +def get_llm_response_task(prefix="", context=None): + if not context: + context = {} + context = json.loads(redis_client.get(RedisVariable.CONTEXT, prefix=prefix, default=context)) llm_response = get_llm_response(context) diff --git a/labs/tasks/repository.py b/labs/tasks/repository.py index 78fc250..1c05fcd 100644 --- a/labs/tasks/repository.py +++ b/labs/tasks/repository.py @@ -1,8 +1,8 @@ import json -import config.configuration_variables as settings from config.celery import app from decorators import time_and_log_function +from django.conf import settings from github.github import GithubRequests from parsers.response import create_file, modify_file, parse_llm_output from tasks.redis_client import RedisStrictClient, RedisVariable diff --git a/labs/tasks/run.py b/labs/tasks/run.py index d72ba71..b057b3e 100644 --- a/labs/tasks/run.py +++ b/labs/tasks/run.py @@ -1,8 +1,8 @@ import os.path -import config.configuration_variables as settings from celery import chain from config.celery import app +from django.conf import settings from tasks import ( apply_code_changes_task, clone_repository_task, From d698b8c9ce76f717c1694c1742fca329366656ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Verdasca?= Date: Wed, 11 Dec 2024 12:05:10 +0000 Subject: [PATCH 2/3] Fix: Typo --- labs/api/schemas/codemonkey.py | 2 +- labs/config/settings.py | 2 +- 
labs/embeddings/embedder.py | 2 +- labs/tasks/llm.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/labs/api/schemas/codemonkey.py b/labs/api/schemas/codemonkey.py index 057908a..fdc0ee3 100644 --- a/labs/api/schemas/codemonkey.py +++ b/labs/api/schemas/codemonkey.py @@ -20,7 +20,7 @@ class VectorizeRepositorySchema(BaseModel): class FindEmbeddingsSchema(LocalRepositoryShema): - similarity_threshold: float = settings.EMBEDDINGS_SIMILARITY_TRESHOLD + similarity_threshold: float = settings.EMBEDDINGS_SIMILARITY_THRESHOLD max_results: int = settings.EMBEDDINGS_MAX_RESULTS diff --git a/labs/config/settings.py b/labs/config/settings.py index 346c5f2..9da8bb2 100644 --- a/labs/config/settings.py +++ b/labs/config/settings.py @@ -186,5 +186,5 @@ def create_logging_directory() -> Path: LOCAL_LLM_HOST = os.environ.get("LOCAL_LLM_HOST", "http://ollama:11434") CLONE_DESTINATION_DIR = os.getenv("CLONE_DESTINATION_DIR", "/tmp/") -EMBEDDINGS_SIMILARITY_TRESHOLD = 0.7 +EMBEDDINGS_SIMILARITY_THRESHOLD = 0.7 EMBEDDINGS_MAX_RESULTS = 10 diff --git a/labs/embeddings/embedder.py b/labs/embeddings/embedder.py index 533afe6..51ddea8 100644 --- a/labs/embeddings/embedder.py +++ b/labs/embeddings/embedder.py @@ -24,7 +24,7 @@ def retrieve_embeddings( self, query: str, repository: str, - similarity_threshold: float = settings.EMBEDDINGS_SIMILARITY_TRESHOLD, + similarity_threshold: float = settings.EMBEDDINGS_SIMILARITY_THRESHOLD, max_results: int = settings.EMBEDDINGS_MAX_RESULTS, ) -> List[Embedding]: query = query.replace("\n", "") diff --git a/labs/tasks/llm.py b/labs/tasks/llm.py index e1b4b50..9e57168 100644 --- a/labs/tasks/llm.py +++ b/labs/tasks/llm.py @@ -89,7 +89,7 @@ def find_embeddings_task( prefix="", issue_body="", repository_path="", - similarity_threshold=settings.EMBEDDINGS_SIMILARITY_TRESHOLD, + similarity_threshold=settings.EMBEDDINGS_SIMILARITY_THRESHOLD, max_results=settings.EMBEDDINGS_MAX_RESULTS, ): embedder_class, *embeder_args = 
Model.get_active_embedding_model() From fadf6f49dee0a57947d286f200df21c9660fab85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Verdasca?= Date: Wed, 11 Dec 2024 15:16:17 +0000 Subject: [PATCH 3/3] Fix: Solve the issue where some logs do not appear --- labs/config/logger.py | 4 ++-- labs/config/settings.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/labs/config/logger.py b/labs/config/logger.py index b28cef4..b7cf996 100644 --- a/labs/config/logger.py +++ b/labs/config/logger.py @@ -14,6 +14,6 @@ def add_fields(self, log_record, record, message_dict): if log_record.get("level"): log_record["level"] = log_record["level"].upper() else: - log_record["level"] = record.levelname + log_record["level"] = record.levelname.upper() - log_record["project"] = "codemonkey" + log_record["project"] = "labs" diff --git a/labs/config/settings.py b/labs/config/settings.py index 9da8bb2..15e23af 100644 --- a/labs/config/settings.py +++ b/labs/config/settings.py @@ -117,13 +117,13 @@ def create_logging_directory() -> Path: "level": "DEBUG", "class": "logging.handlers.RotatingFileHandler", "filename": create_logging_directory(), - "maxBytes": 10000000, + "maxBytes": 10000000, # 10 Mb "backupCount": 5, "formatter": "json", }, }, "loggers": { - "labs": { + "root": { "level": "DEBUG", "handlers": ["console", "file"], "propagate": False,