diff --git a/.github/workflows/check-file-size.yml b/.github/workflows/check-file-size.yml
index a9fe522..d127759 100644
--- a/.github/workflows/check-file-size.yml
+++ b/.github/workflows/check-file-size.yml
@@ -12,6 +12,8 @@ jobs:
   check-file-size:
     runs-on: ubuntu-latest
     steps:
+      - name: Checkout
+        uses: actions/checkout@v4
       - name: Check large files
         uses: ppremk/lfs-warning@v3.2
         with:
diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml
index f76274b..399123c 100644
--- a/.github/workflows/pipeline.yml
+++ b/.github/workflows/pipeline.yml
@@ -1,4 +1,4 @@
-name: Test Build and Deploy
+name: Pipeline
 
 on:
   push:
@@ -6,6 +6,7 @@ on:
   # to run this workflow manually from the Actions tab
   workflow_dispatch:
 
+# Test, Build and Deploy the app
 jobs:
   check-file-size:
     uses: ./.github/workflows/check-file-size.yml
diff --git a/Dockerfile b/Dockerfile
index bef2e7f..e2c8245 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,7 +5,7 @@ FROM nvidia/cuda:${CUDA_IMAGE}
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
 
-# Install the dependencies
+# Install the dependencies & clean up
 RUN apt-get update && apt-get upgrade -y \
     && apt-get install -y git build-essential \
     python3.11 gcc wget \
@@ -13,7 +13,10 @@ RUN apt-get update && apt-get upgrade -y \
     cmake protobuf-compiler pkg-config \
     libclblast-dev libopenblas-dev \
     liblapack-dev liblapacke-dev libeigen3-dev libboost-all-dev \
-    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \
+    # Cleaning cache:
+    && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
+    && apt-get clean -y && rm -rf /var/lib/apt/lists/*
 
 # Install pip for python 3.11
 RUN wget https://bootstrap.pypa.io/get-pip.py && \
@@ -23,21 +26,25 @@ RUN wget https://bootstrap.pypa.io/get-pip.py && \
 # Switch to the user 'user'
 USER user
 
-# Setting build related env vars
-ENV CUDA_DOCKER_ARCH=all
-ENV LLAMA_CUBLAS=1
-
-# Set home to the user's home directory and Poetry's environment variables
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH \
+# Setting build / container related env vars
+ENV CUDA_DOCKER_ARCH=all \
+    LLAMA_CUBLAS=1 \
+    # Set home to the user's home directory and Poetry's environment variables
+    HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH \
     PYTHONUNBUFFERED=1 \
     POETRY_NO_INTERACTION=1 \
     POETRY_VIRTUALENVS_IN_PROJECT=1 \
     POETRY_VIRTUALENVS_CREATE=1 \
     POETRY_CACHE_DIR=/tmp/poetry_cache \
-    # Build llama-cpp-python with default cuda support
-    CMAKE_ARGS="-DLLAMA_CUBLAS=on"
-    # CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
+    # Set the uvicorn env
+    ENVIRONMENT=prod \
+    ##########################################################
+    # Build llama-cpp-python with cuda support
+    # CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+    # Build llama-cpp-python with openblas support on CPU
+    CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
+    ##########################################################
 
 # Set the working directory to /app
 WORKDIR $HOME/app
@@ -61,4 +68,8 @@ RUN poetry install --without dev,torch-cpu && \
 # Change to the package directory
 WORKDIR $HOME/app/backend
 
+# Make port 8000 available to the world outside this container
+EXPOSE 8000
+
+# Run the app when the container launches
 CMD ["poetry", "run", "uvicorn", "main:app", "--host", "0.0.0.0"]
\ No newline at end of file
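The consolidated ENV block above bakes ENVIRONMENT=prod into the image and switches CMAKE_ARGS from the cuBLAS build to OpenBLAS, so llama-cpp-python is compiled for CPU inference even though the base image is a CUDA image. A minimal sketch of what that implies at runtime inside the container (not part of this diff, just an illustration):

```python
# Hypothetical startup check -- not in the repo, only illustrating the effect
# of the new ENV block in the Dockerfile.
import os

from torch.cuda import is_available as is_cuda_available

environment = os.getenv("ENVIRONMENT", "dev")  # the image now bakes in "prod"
print(f"ENVIRONMENT={environment}")
print(f"torch sees CUDA: {is_cuda_available()}")
# With CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" at build time,
# llama-cpp-python runs on CPU/OpenBLAS; n_gpu_layers should have no effect
# without a cuBLAS build, even if torch reports CUDA as available.
```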
diff --git a/backend/.gitignore b/backend/.gitignore
index 069fcb4..0fb5f02 100644
--- a/backend/.gitignore
+++ b/backend/.gitignore
@@ -1,2 +1,3 @@
 __pycache__
 storage
+.env
\ No newline at end of file
diff --git a/backend/backend/app/api/routers/chat.py b/backend/backend/app/api/routers/chat.py
index ef9910c..2a32a1d 100644
--- a/backend/backend/app/api/routers/chat.py
+++ b/backend/backend/app/api/routers/chat.py
@@ -1,8 +1,6 @@
 import logging
 from typing import List
 
-from app.utils.index import get_index
-from app.utils.json import json_to_model
 from fastapi import APIRouter, Depends, HTTPException, Request, status
 from fastapi.responses import StreamingResponse
 from fastapi.websockets import WebSocketDisconnect
@@ -13,6 +11,9 @@
 from llama_index.prompts import PromptTemplate
 from pydantic import BaseModel
 
+from backend.app.utils.index import get_index
+from backend.app.utils.json import json_to_model
+
 chat_router = r = APIRouter()
 
 """
diff --git a/backend/backend/app/api/routers/query.py b/backend/backend/app/api/routers/query.py
index a00e7bf..2defacb 100644
--- a/backend/backend/app/api/routers/query.py
+++ b/backend/backend/app/api/routers/query.py
@@ -1,8 +1,6 @@
 import logging
 from typing import List
 
-from app.utils.index import get_index
-from app.utils.json import json_to_model
 from fastapi import APIRouter, Depends, HTTPException, Request, status
 from fastapi.responses import StreamingResponse
 from fastapi.websockets import WebSocketDisconnect
@@ -10,6 +8,9 @@
 from llama_index.llms.types import MessageRole
 from pydantic import BaseModel
 
+from backend.app.utils.index import get_index
+from backend.app.utils.json import json_to_model
+
 query_router = r = APIRouter()
 
 """
diff --git a/backend/backend/app/api/routers/search.py b/backend/backend/app/api/routers/search.py
index b234166..938be3b 100644
--- a/backend/backend/app/api/routers/search.py
+++ b/backend/backend/app/api/routers/search.py
@@ -1,12 +1,13 @@
 import logging
 import re
 
-from app.utils.index import get_index
 from fastapi import APIRouter, Depends, HTTPException, Request, status
 from llama_index import VectorStoreIndex
 from llama_index.postprocessor import SimilarityPostprocessor
 from llama_index.retrievers import VectorIndexRetriever
 
+from backend.app.utils.index import get_index
+
 search_router = r = APIRouter()
 
 """
@@ -36,17 +37,22 @@ async def search(
         index=index,
         similarity_top_k=10,
     )
-    # similarity postprocessor: filter nodes below 0.45 similarity score
-    node_postprocessor = SimilarityPostprocessor(similarity_cutoff=0.45)
 
     # retrieve results
     query_results = retriever.retrieve(query)
     query_results_scores = [result.get_score() for result in query_results]
 
+    # get average score
+    average_score = sum(query_results_scores) / len(query_results_scores)
+    logger.info(f"Search results similarity score: {query_results_scores}")
+    logger.info(f"Average similarity score: {average_score}")
+
+    # similarity postprocessor: filter nodes below 0.45 similarity score
+    node_postprocessor = SimilarityPostprocessor(similarity_cutoff=average_score)
 
-    # postprocess results
+    # postprocess results based on average score
     filtered_results = node_postprocessor.postprocess_nodes(query_results)
     filtered_results_scores = [result.get_score() for result in filtered_results]
@@ -68,9 +74,7 @@
             "^_+ | _+$", "", node_dict["text"]
         )  # remove leading and trailing underscores
         data["text"] = cleaned_text
-        data["similarity_score"] = round(
-            node.get_score(), 2
-        )  # round to 2 decimal places
+        data["similarity_score"] = node.get_score()
         response.append(data)
         id += 1
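The search route now derives its similarity cutoff from the average score of the retrieved nodes instead of the fixed 0.45 threshold. A standalone sketch of that filtering step is below; the NodeWithScore/TextNode import path is an assumption based on the pre-0.10 llama_index layout used elsewhere in this diff.

```python
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.schema import NodeWithScore, TextNode

# Toy retrieval results standing in for retriever.retrieve(query)
query_results = [
    NodeWithScore(node=TextNode(text="very relevant"), score=0.82),
    NodeWithScore(node=TextNode(text="somewhat relevant"), score=0.55),
    NodeWithScore(node=TextNode(text="barely relevant"), score=0.31),
]

query_results_scores = [result.get_score() for result in query_results]
average_score = sum(query_results_scores) / len(query_results_scores)  # 0.56 here

# Dynamic cutoff: drop nodes scoring below the average instead of below 0.45
node_postprocessor = SimilarityPostprocessor(similarity_cutoff=average_score)
filtered_results = node_postprocessor.postprocess_nodes(query_results)
print([(n.node.text, n.get_score()) for n in filtered_results])  # keeps only the 0.82 node
```

One consequence of cutting at the mean: some portion of the retrieved nodes is filtered out on every query, even when all ten results score highly.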
diff --git a/backend/backend/app/utils/contants.py b/backend/backend/app/utils/contants.py
new file mode 100644
index 0000000..ef166b7
--- /dev/null
+++ b/backend/backend/app/utils/contants.py
@@ -0,0 +1,39 @@
+########################################################################
+# Model Constants for the backend app                                  #
+########################################################################
+from pathlib import Path
+
+from torch.cuda import is_available as is_cuda_available
+
+# Model Constants
+MAX_NEW_TOKENS = 4096
+CONTEXT_SIZE = MAX_NEW_TOKENS
+DEVICE_TYPE = "cuda" if is_cuda_available() else "cpu"
+
+# Get the current directory
+CUR_DIR = Path.cwd()
+
+STORAGE_DIR = str(CUR_DIR / "storage")  # directory to cache the generated index
+DATA_DIR = str(CUR_DIR / "data")  # directory containing the documents to index
+
+# LLM Model Constants
+LLM_MODEL_URL = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
+# Model Kwargs
+# set to at least 1 to use GPU, adjust according to your GPU memory, but must be able to fit the model
+MODEL_KWARGS = {"n_gpu_layers": 100} if DEVICE_TYPE == "cuda" else {}
+
+# Service Context Constants
+CHUNK_SIZE = 1000
+CHUNK_OVERLAP = 100
+
+# Embedding Model Constants
+EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+EMBED_POOLING = "mean"
+
+# Prompt Helper Constants
+# set maximum input size
+CHUNK_SIZE_LIMIT = MAX_NEW_TOKENS
+# set number of output tokens
+NUM_OUTPUT = 256
+# set maximum chunk overlap
+CHUNK_OVERLAP_RATIO = 0.2
diff --git a/backend/backend/app/utils/index.py b/backend/backend/app/utils/index.py
index 45350ee..7c59767 100644
--- a/backend/backend/app/utils/index.py
+++ b/backend/backend/app/utils/index.py
@@ -1,6 +1,5 @@
 import logging
 import os
-from pathlib import Path
 
 from llama_index import (
     PromptHelper,
@@ -17,29 +16,26 @@
     completion_to_prompt,
     messages_to_prompt,
 )
-from torch.cuda import is_available as is_cuda_available
 
-MAX_NEW_TOKENS = 4096
-CONTEXT_SIZE = MAX_NEW_TOKENS
-MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGUF"
-DEVICE_TYPE = "cuda" if is_cuda_available() else "cpu"
-
-# Get the current directory
-current_directory = Path.cwd()
-
-STORAGE_DIR = str(
-    current_directory / "storage"
-)  # directory to cache the generated index
-DATA_DIR = str(
-    current_directory / "data"
-)  # directory containing the documents to index
-
-
-# set to at least 1 to use GPU, adjust according to your GPU memory, but must be able to fit the model
-model_kwargs = {"n_gpu_layers": 100} if DEVICE_TYPE == "cuda" else {}
+from backend.app.utils.contants import (
+    CHUNK_OVERLAP,
+    CHUNK_OVERLAP_RATIO,
+    CHUNK_SIZE,
+    CHUNK_SIZE_LIMIT,
+    CONTEXT_SIZE,
+    DATA_DIR,
+    DEVICE_TYPE,
+    EMBED_MODEL_NAME,
+    EMBED_POOLING,
+    LLM_MODEL_URL,
+    MAX_NEW_TOKENS,
+    MODEL_KWARGS,
+    NUM_OUTPUT,
+    STORAGE_DIR,
+)
 
 llm = LlamaCPP(
-    model_url="https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf",
+    model_url=LLM_MODEL_URL,
     temperature=0.1,
     max_new_tokens=MAX_NEW_TOKENS,
     # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
@@ -47,7 +43,7 @@
     # kwargs to pass to __call__()
     # generate_kwargs={},
     # kwargs to pass to __init__()
-    model_kwargs=model_kwargs,
+    model_kwargs=MODEL_KWARGS,
     # transform inputs into Llama2 format
     messages_to_prompt=messages_to_prompt,
     completion_to_prompt=completion_to_prompt,
@@ -63,22 +59,22 @@
 max_chunk_overlap = 0.2
 
 embed_model = HuggingFaceEmbedding(
-    model_name="sentence-transformers/all-MiniLM-L6-v2",
-    pooling="mean",
+    model_name=EMBED_MODEL_NAME,
+    pooling=EMBED_POOLING,
     device=DEVICE_TYPE,
 )
 
 prompt_helper = PromptHelper(
-    chunk_size_limit=4096,
-    chunk_overlap_ratio=0.2,
-    num_output=256,
+    chunk_size_limit=CHUNK_SIZE_LIMIT,
+    chunk_overlap_ratio=CHUNK_OVERLAP_RATIO,
+    num_output=NUM_OUTPUT,
 )
 
 service_context = ServiceContext.from_defaults(
     llm=llm,
     embed_model=embed_model,
-    chunk_size=1000,
-    chunk_overlap=100,
+    chunk_size=CHUNK_SIZE,
+    chunk_overlap=CHUNK_OVERLAP,
     prompt_helper=prompt_helper,
 )
diff --git a/backend/backend/main.py b/backend/backend/main.py
index 65cb640..6f87bf9 100644
--- a/backend/backend/main.py
+++ b/backend/backend/main.py
@@ -1,29 +1,31 @@
 import logging
 import os
 
-from app.api.routers.chat import chat_router
-from app.api.routers.healthcheck import healthcheck_router
-from app.api.routers.query import query_router
-from app.api.routers.search import search_router
-from app.utils.index import create_index
 from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from torch.cuda import is_available as is_cuda_available
 
+from backend.app.api.routers.chat import chat_router
+from backend.app.api.routers.healthcheck import healthcheck_router
+from backend.app.api.routers.query import query_router
+from backend.app.api.routers.search import search_router
+from backend.app.utils.index import create_index
+
 load_dotenv()
 
 app = FastAPI()
 
 environment = os.getenv("ENVIRONMENT", "dev")  # Default to 'development' if not set
 
-# TODO: Add reading allowed origins from environment variables
+# Add allowed origins from environment variables
+allowed_origins = os.getenv("ALLOWED_ORIGINS", "*")
 
 if environment == "dev":
     logger = logging.getLogger("uvicorn")
     logger.warning("Running in development mode - allowing CORS for all origins")
     app.add_middleware(
-        CORSMiddleware,
+        middleware_class=CORSMiddleware,
         allow_origins=["*"],
         allow_credentials=True,
         allow_methods=["*"],
@@ -32,19 +34,15 @@
 
 if environment == "prod":
     # In production, specify the allowed origins
-    allowed_origins = [
-        "https://your-production-domain.com",
-        "https://another-production-domain.com",
-        # Add more allowed origins as needed
-    ]
+    allowed_origins = allowed_origins.split(",") if allowed_origins != "*" else ["*"]
     logger = logging.getLogger("uvicorn")
     logger.info(f"Running in production mode - allowing CORS for {allowed_origins}")
     app.add_middleware(
-        CORSMiddleware,
+        middleware_class=CORSMiddleware,
         allow_origins=allowed_origins,
         allow_credentials=True,
-        allow_methods=["GET", "POST", "PUT", "DELETE"],
+        allow_methods=["GET", "POST"],
         allow_headers=["*"],
     )
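main.py now reads the allowed CORS origins from an ALLOWED_ORIGINS environment variable (comma-separated, defaulting to "*") instead of a hard-coded production list. A minimal sketch of that parsing path, assuming a backend/.env file modeled on the example.env added below:

```python
# Sketch of the new CORS-origin handling; mirrors the logic in main.py.
import os

from dotenv import load_dotenv

load_dotenv()  # picks up backend/.env, which is now git-ignored

allowed_origins = os.getenv("ALLOWED_ORIGINS", "*")
origins = allowed_origins.split(",") if allowed_origins != "*" else ["*"]
print(origins)  # e.g. ["http://localhost:3000"] with the value from example.env
```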