Enhancements & Refactored Code (#10)
* Fix: Button loading state not reset on 'back'

* Feat: search post processing to filter by average score

* Feat: Updated api status error handling

* Feat: Updated error handling & timeout duration

* Refactored imports & moved global vars to constants.py

* Feat: Updated dockerfile to install llama-cpp-python with openblas support by default

* Add .env file and update gitignore, pipeline name, robots.txt, middleware, layout, page, sitemap, and navlink components

* Fixed Pipeline Name

* Updated Check File Size workflow
xKhronoz authored Jan 30, 2024
1 parent 73a5bd8 commit f730525
Showing 27 changed files with 539 additions and 221 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/check-file-size.yml
@@ -12,6 +12,8 @@ jobs:
  check-file-size:
    runs-on: ubuntu-latest
    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
      - name: Check large files
        uses: ppremk/[email protected]
        with:
3 changes: 2 additions & 1 deletion .github/workflows/pipeline.yml
@@ -1,11 +1,12 @@
-name: Test Build and Deploy
+name: Pipeline

on:
  push:
    branches: [main]
  # to run this workflow manually from the Actions tab
  workflow_dispatch:

+# Test, Build and Deploy the app
jobs:
  check-file-size:
    uses: ./.github/workflows/check-file-size.yml
35 changes: 23 additions & 12 deletions Dockerfile
@@ -5,15 +5,18 @@ FROM nvidia/cuda:${CUDA_IMAGE}
# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

-# Install the dependencies
+# Install the dependencies & clean up
RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y git build-essential \
    python3.11 gcc wget \
    ocl-icd-opencl-dev opencl-headers clinfo \
    cmake protobuf-compiler pkg-config \
    libclblast-dev libopenblas-dev \
    liblapack-dev liblapacke-dev libeigen3-dev libboost-all-dev \
-    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \
+    # Cleaning cache:
+    && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
+    && apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Install pip for python 3.11
RUN wget https://bootstrap.pypa.io/get-pip.py && \
@@ -23,21 +26,25 @@ RUN wget https://bootstrap.pypa.io/get-pip.py && \
# Switch to the user 'user'
USER user

-# Setting build related env vars
-ENV CUDA_DOCKER_ARCH=all
-ENV LLAMA_CUBLAS=1
-
-# Set home to the user's home directory and Poetry's environment variables
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH \
+# Setting build / container related env vars
+ENV CUDA_DOCKER_ARCH=all \
+    LLAMA_CUBLAS=1 \
+    # Set home to the user's home directory and Poetry's environment variables
+    HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH \
    PYTHONUNBUFFERED=1 \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache \
-    # Build llama-cpp-python with default cuda support
-    CMAKE_ARGS="-DLLAMA_CUBLAS=on"
-    # CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
+    # Set the uvicorn env
+    ENVIRONMENT=prod \
+    ##########################################################
+    # Build llama-cpp-python with cuda support
+    # CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+    # Build llama-cpp-python with openblas support on CPU
+    CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
+    ##########################################################

# Set the working directory to /app
WORKDIR $HOME/app
@@ -61,4 +68,8 @@ RUN poetry install --without dev,torch-cpu && \
# Change to the package directory
WORKDIR $HOME/app/backend

+# Make port 8000 available to the world outside this container
+EXPOSE 8000
+
+# Run the app when the container launches
CMD ["poetry", "run", "uvicorn", "main:app", "--host", "0.0.0.0"]
1 change: 1 addition & 0 deletions backend/.gitignore
@@ -1,2 +1,3 @@
__pycache__
storage
+.env
5 changes: 3 additions & 2 deletions backend/backend/app/api/routers/chat.py
@@ -1,8 +1,6 @@
import logging
from typing import List

-from app.utils.index import get_index
-from app.utils.json import json_to_model
from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi.responses import StreamingResponse
from fastapi.websockets import WebSocketDisconnect
@@ -13,6 +11,9 @@
from llama_index.prompts import PromptTemplate
from pydantic import BaseModel

+from backend.app.utils.index import get_index
+from backend.app.utils.json import json_to_model
+
chat_router = r = APIRouter()

"""
5 changes: 3 additions & 2 deletions backend/backend/app/api/routers/query.py
@@ -1,15 +1,16 @@
import logging
from typing import List

-from app.utils.index import get_index
-from app.utils.json import json_to_model
from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi.responses import StreamingResponse
from fastapi.websockets import WebSocketDisconnect
from llama_index import VectorStoreIndex
from llama_index.llms.types import MessageRole
from pydantic import BaseModel

+from backend.app.utils.index import get_index
+from backend.app.utils.json import json_to_model
+
query_router = r = APIRouter()

"""
18 changes: 11 additions & 7 deletions backend/backend/app/api/routers/search.py
@@ -1,12 +1,13 @@
import logging
import re

-from app.utils.index import get_index
from fastapi import APIRouter, Depends, HTTPException, Request, status
from llama_index import VectorStoreIndex
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.retrievers import VectorIndexRetriever

+from backend.app.utils.index import get_index
+
search_router = r = APIRouter()

"""
@@ -36,17 +37,22 @@ async def search(
        index=index,
        similarity_top_k=10,
    )
-    # similarity postprocessor: filter nodes below 0.45 similarity score
-    node_postprocessor = SimilarityPostprocessor(similarity_cutoff=0.45)

    # retrieve results
    query_results = retriever.retrieve(query)

    query_results_scores = [result.get_score() for result in query_results]

+    # get average score
+    average_score = sum(query_results_scores) / len(query_results_scores)
+
    logger.info(f"Search results similarity score: {query_results_scores}")
+    logger.info(f"Average similarity score: {average_score}")

+    # similarity postprocessor: filter nodes below the average similarity score
+    node_postprocessor = SimilarityPostprocessor(similarity_cutoff=average_score)
+
-    # postprocess results
+    # postprocess results based on average score
    filtered_results = node_postprocessor.postprocess_nodes(query_results)

    filtered_results_scores = [result.get_score() for result in filtered_results]
@@ -68,9 +74,7 @@ async def search(
"^_+ | _+$", "", node_dict["text"]
) # remove leading and trailing underscores
data["text"] = cleaned_text
data["similarity_score"] = round(
node.get_score(), 2
) # round to 2 decimal places
data["similarity_score"] = node.get_score()
response.append(data)
id += 1

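The search change above replaces the fixed 0.45 similarity cutoff with a dynamic one: the retriever's top-10 scores are averaged, and that mean becomes the SimilarityPostprocessor cutoff, so hits scoring below the average of their own batch are dropped. Here is a minimal standalone sketch of the same idea in plain Python (ScoredNode and filter_by_average_score are illustrative names, not the app's actual code; note the empty-list guard, which the diff itself does not have):

# Standalone sketch of the average-score filter introduced in search.py.
# ScoredNode stands in for llama_index's NodeWithScore; only the scoring logic matters here.
from dataclasses import dataclass
from typing import List


@dataclass
class ScoredNode:
    text: str
    score: float

    def get_score(self) -> float:
        return self.score


def filter_by_average_score(results: List[ScoredNode]) -> List[ScoredNode]:
    """Keep only results whose similarity score is at least the batch mean."""
    if not results:
        # Guard: the diff divides unconditionally, so an empty retrieval would raise ZeroDivisionError.
        return []
    average = sum(r.get_score() for r in results) / len(results)
    return [r for r in results if r.get_score() >= average]


hits = [ScoredNode("a", 0.82), ScoredNode("b", 0.61), ScoredNode("c", 0.44)]
print([h.text for h in filter_by_average_score(hits)])  # mean is about 0.62, so only "a" survives

One trade-off worth noting: an average cutoff always discards part of every result set, even when all hits are strong, whereas the old fixed 0.45 cutoff could keep or reject a whole batch.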
39 changes: 39 additions & 0 deletions backend/backend/app/utils/contants.py
@@ -0,0 +1,39 @@
########################################################################
# Model Constants for the backend app #
########################################################################
from pathlib import Path

from torch.cuda import is_available as is_cuda_available

# Model Constants
MAX_NEW_TOKENS = 4096
CONTEXT_SIZE = MAX_NEW_TOKENS
DEVICE_TYPE = "cuda" if is_cuda_available() else "cpu"

# Get the current directory
CUR_DIR = Path.cwd()

STORAGE_DIR = str(CUR_DIR / "storage") # directory to cache the generated index
DATA_DIR = str(CUR_DIR / "data") # directory containing the documents to index

# LLM Model Constants
LLM_MODEL_URL = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
# Model Kwargs
# set to at least 1 to use GPU, adjust according to your GPU memory, but must be able to fit the model
MODEL_KWARGS = {"n_gpu_layers": 100} if DEVICE_TYPE == "cuda" else {}

# Service Context Constants
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

# Embedding Model Constants
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
EMBED_POOLING = "mean"

# Prompt Helper Constants
# set maximum input size
CHUNK_SIZE_LIMIT = MAX_NEW_TOKENS
# set number of output tokens
NUM_OUTPUT = 256
# set maximum chunk overlap ratio
CHUNK_OVERLAP_RATIO = 0.2
54 changes: 25 additions & 29 deletions backend/backend/app/utils/index.py
@@ -1,6 +1,5 @@
import logging
import os
-from pathlib import Path

from llama_index import (
PromptHelper,
@@ -17,37 +16,34 @@
    completion_to_prompt,
    messages_to_prompt,
)
-from torch.cuda import is_available as is_cuda_available
-
-MAX_NEW_TOKENS = 4096
-CONTEXT_SIZE = MAX_NEW_TOKENS
-MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGUF"
-DEVICE_TYPE = "cuda" if is_cuda_available() else "cpu"
-
-# Get the current directory
-current_directory = Path.cwd()
-
-STORAGE_DIR = str(
-    current_directory / "storage"
-)  # directory to cache the generated index
-DATA_DIR = str(
-    current_directory / "data"
-)  # directory containing the documents to index
-
-# set to at least 1 to use GPU, adjust according to your GPU memory, but must be able to fit the model
-model_kwargs = {"n_gpu_layers": 100} if DEVICE_TYPE == "cuda" else {}
+from backend.app.utils.contants import (
+    CHUNK_OVERLAP,
+    CHUNK_OVERLAP_RATIO,
+    CHUNK_SIZE,
+    CHUNK_SIZE_LIMIT,
+    CONTEXT_SIZE,
+    DATA_DIR,
+    DEVICE_TYPE,
+    EMBED_MODEL_NAME,
+    EMBED_POOLING,
+    LLM_MODEL_URL,
+    MAX_NEW_TOKENS,
+    MODEL_KWARGS,
+    NUM_OUTPUT,
+    STORAGE_DIR,
+)

llm = LlamaCPP(
-    model_url="https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf",
+    model_url=LLM_MODEL_URL,
    temperature=0.1,
    max_new_tokens=MAX_NEW_TOKENS,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=CONTEXT_SIZE,
    # kwargs to pass to __call__()
    # generate_kwargs={},
    # kwargs to pass to __init__()
-    model_kwargs=model_kwargs,
+    model_kwargs=MODEL_KWARGS,
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
@@ -63,22 +59,22 @@
max_chunk_overlap = 0.2

embed_model = HuggingFaceEmbedding(
-    model_name="sentence-transformers/all-MiniLM-L6-v2",
-    pooling="mean",
+    model_name=EMBED_MODEL_NAME,
+    pooling=EMBED_POOLING,
    device=DEVICE_TYPE,
)

prompt_helper = PromptHelper(
-    chunk_size_limit=4096,
-    chunk_overlap_ratio=0.2,
-    num_output=256,
+    chunk_size_limit=CHUNK_SIZE_LIMIT,
+    chunk_overlap_ratio=CHUNK_OVERLAP_RATIO,
+    num_output=NUM_OUTPUT,
)

service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
-    chunk_size=1000,
-    chunk_overlap=100,
+    chunk_size=CHUNK_SIZE,
+    chunk_overlap=CHUNK_OVERLAP,
    prompt_helper=prompt_helper,
)

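The bodies of create_index and get_index sit outside this hunk, so only their imports changed here. For context, a typical llama_index (~0.9) persistence pattern that such helpers follow, sketched under assumption rather than copied from the repo, caches the vector index in STORAGE_DIR and rebuilds it from DATA_DIR when the cache is missing:

# Hypothetical sketch of the create/load pattern behind get_index(); not the repo's verbatim code.
import logging
import os

from llama_index import (
    ServiceContext,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

from backend.app.utils.contants import DATA_DIR, STORAGE_DIR

logger = logging.getLogger("uvicorn")


def get_index(service_context: ServiceContext) -> VectorStoreIndex:
    if not os.path.exists(STORAGE_DIR):
        # First run: embed the documents in DATA_DIR and cache the index.
        documents = SimpleDirectoryReader(DATA_DIR).load_data()
        index = VectorStoreIndex.from_documents(documents, service_context=service_context)
        index.storage_context.persist(STORAGE_DIR)
        logger.info(f"Built new index and stored it in {STORAGE_DIR}")
        return index
    # Later runs: load the cached index instead of re-embedding everything.
    storage_context = StorageContext.from_defaults(persist_dir=STORAGE_DIR)
    return load_index_from_storage(storage_context, service_context=service_context)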
26 changes: 12 additions & 14 deletions backend/backend/main.py
@@ -1,29 +1,31 @@
import logging
import os

-from app.api.routers.chat import chat_router
-from app.api.routers.healthcheck import healthcheck_router
-from app.api.routers.query import query_router
-from app.api.routers.search import search_router
-from app.utils.index import create_index
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from torch.cuda import is_available as is_cuda_available

+from backend.app.api.routers.chat import chat_router
+from backend.app.api.routers.healthcheck import healthcheck_router
+from backend.app.api.routers.query import query_router
+from backend.app.api.routers.search import search_router
+from backend.app.utils.index import create_index
+
load_dotenv()

app = FastAPI()

environment = os.getenv("ENVIRONMENT", "dev")  # Default to 'dev' if not set

-# TODO: Add reading allowed origins from environment variables
+# Add allowed origins from environment variables
+allowed_origins = os.getenv("ALLOWED_ORIGINS", "*")

if environment == "dev":
    logger = logging.getLogger("uvicorn")
    logger.warning("Running in development mode - allowing CORS for all origins")
    app.add_middleware(
-        CORSMiddleware,
+        middleware_class=CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
@@ -32,19 +34,15 @@

if environment == "prod":
# In production, specify the allowed origins
allowed_origins = [
"https://your-production-domain.com",
"https://another-production-domain.com",
# Add more allowed origins as needed
]
allowed_origins = allowed_origins.split(",") if allowed_origins != "*" else ["*"]

logger = logging.getLogger("uvicorn")
logger.info(f"Running in production mode - allowing CORS for {allowed_origins}")
app.add_middleware(
CORSMiddleware,
middleware_class=CORSMiddleware,
allow_origins=allowed_origins,
allow_credentials=True,
allow_methods=["GET", "POST", "PUT", "DELETE"],
allow_methods=["GET", "POST"],
allow_headers=["*"],
)

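With this change the production CORS whitelist comes from the ALLOWED_ORIGINS environment variable (comma-separated; see example.env below) instead of hard-coded placeholder domains. The parsing rule in isolation, written as a hypothetical helper:

# Hypothetical helper isolating main.py's ALLOWED_ORIGINS parsing rule.
import os
from typing import List


def parse_allowed_origins(raw: str) -> List[str]:
    """'*' keeps CORS open to every origin; anything else is a comma-separated whitelist."""
    return raw.split(",") if raw != "*" else ["*"]


os.environ["ALLOWED_ORIGINS"] = "http://localhost:3000,https://example.com"
print(parse_allowed_origins(os.getenv("ALLOWED_ORIGINS", "*")))
# -> ['http://localhost:3000', 'https://example.com']

Note that the split does not strip whitespace, so a value like "a.com, b.com" would yield a leading-space origin that fails the exact-match CORS check; trimming each entry would harden this.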
1 change: 1 addition & 0 deletions backend/example.env
@@ -0,0 +1 @@
ALLOWED_ORIGINS=http://localhost:3000
2 changes: 1 addition & 1 deletion frontend/app/about/page.tsx
@@ -3,7 +3,7 @@
export default function About() {

return (
<div className="rounded-xl shadow-xl p-4 mb-8 max-w-5xl w-full">
<div className="rounded-xl shadow-xl p-4 max-w-5xl w-full">
<div className="max-w-2xl mx-auto p-4">
<div className="bg-gradient-to-r from-blue-500 to-indigo-500 text-white p-8 rounded-lg shadow-lg">
<h1 className="text-2xl md:text-4xl font-bold mb-4">About Smart Retrieval</h1>